[discuss] [PATCH] AMD Family 10h revision B2 Erratum 298 and L2 Eviction Bug Workaround

Joerg Roedel joerg.roedel at amd.com
Wed Dec 5 19:31:13 CET 2007


This patch is a workaround for AMD erratum 298. Due to the very invasive
nature of this patch and the very small number of affected customers
(you will know if you have an affected part), we do not recommend using
this patch on a regular Linux system. This patch is NOT intended for
mainline acceptance or for inclusion in a Linux distribution! The patch
has received only minimal functional testing. Every user must evaluate
it prior to production use to make sure it meets the necessary quality
standards. Like all GPL software, this patch comes with absolutely no
warranty.
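
For reference, a short summary of how the workaround operates (condensed
from the patch below): the problematic operation is the processor's own
locked update of a page table entry's Accessed or Dirty bit, so the patch
arranges for hardware never to perform that update. init_amd() re-enables
the TLB caching that the BIOS-level workaround disables, and the page
table code tracks the intended permissions in otherwise unused PTE bits
(_PAGE_E_PRESENT, _PAGE_E_RW, _PAGE_E_ACCESSED, _PAGE_E_DIRTY) while
keeping the architectural Present/Writable bits clear. The first access
or write to a page then faults into do_page_fault(), which sets the real
bits in software. The core of the fault-side emulation, shortened from
the do_page_fault() hunk below (first-access case only; the write/dirty
case is analogous):

	pte_t *pte = get_pte(address);	/* walks the current page tables */
	if (pte != NULL) {
		pte_t old_entry = *pte;
		pte_t entry = old_entry;

		/* not a protection fault, and the mapping is logically
		 * present but not yet marked present/accessed in hardware */
		if (!(error_code & PF_PROT) &&
				(pte_val(entry) & _PAGE_E_PRESENT) &&
				!(pte_val(entry) & _PAGE_PRESENT) &&
				!(pte_val(entry) & _PAGE_E_ACCESSED)) {
			entry = __pte(pte_val(entry) | _PAGE_E_ACCESSED | _PAGE_PRESENT);
			/* cmpxchg instead of set_pte to avoid racing other CPUs */
			cmpxchg(&pte_val(*pte), pte_val(old_entry), pte_val(entry));
			return;		/* fault handled, no signal is delivered */
		}
	}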

-- 
           |           AMD Saxony Limited Liability Company & Co. KG
 Operating |         Wilschdorfer Landstr. 101, 01109 Dresden, Germany
 System    |                  Register Court Dresden: HRA 4896
 Research  |              General Partner authorized to represent:
 Center    |             AMD Saxony LLC (Wilmington, Delaware, US)
           | General Manager of AMD Saxony LLC: Dr. Hans-R. Deppe, Thomas McCoy
-------------- next part --------------
diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index b6167fe..40982c2 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -95,7 +95,8 @@ startup_64:
 	movq	%rdi, %rax
 	shrq	$PMD_SHIFT, %rax
 	andq	$(PTRS_PER_PMD - 1), %rax
-	leaq	__PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx
+	movq    $__PAGE_KERNEL_LARGE_EXEC, %rdx
+	orq	%rdi, %rdx
 	leaq	level2_spare_pgt(%rip), %rbx
 	movq	%rdx, 0(%rbx, %rax, 8)
 ident_complete:
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index af838f6..0b53fde 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -617,6 +617,21 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 	/* Family 10 doesn't support C states in MWAIT so don't use it */
 	if (c->x86 == 0x10 && !force_mwait)
 		clear_bit(X86_FEATURE_MWAIT, &c->x86_capability);
+
+	/* enable workaround for AMD Erratum 298 if necessary */ 
+	if ((c->x86 == 0x10) && (c->x86_model < 3) && (c->x86_mask != 3)) {
+		/* re-enable TLB caching if BIOS disabled it */
+		rdmsrl(MSR_K8_HWCR, value);
+		value &= ~(1UL << 3);
+		wrmsrl(MSR_K8_HWCR, value);
+		rdmsrl(0xC0011023, value);
+		value &= ~(1UL << 1);
+		wrmsrl(0xC0011023, value);
+		/* enable OS workaround */
+		e298_bug = 1;
+		printk(KERN_INFO "AMD erratum 298 workaround enabled\n");
+	}
+
 }
 
 static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 54816ad..ea7fa4e 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -187,6 +187,40 @@ bad:
 	printk("BAD\n");
 }
 
+pte_t* get_pte(unsigned long address)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	pgd = (pgd_t *)read_cr3();
+
+	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
+	pgd += pgd_index(address);
+	if (!pgd_present(*pgd))
+		goto ret;
+
+	pud = pud_offset(pgd, address);
+	if (bad_address(pud) || !pud_present(*pud))
+		goto ret;
+
+	pmd = pmd_offset(pud, address);
+	if (bad_address(pmd) || !pmd_present(*pmd))
+		goto ret;
+	if (pmd_large(*pmd))
+		return (pte_t*)pmd;
+
+	pte = pte_offset_kernel(pmd, address);
+	if (bad_address(pte))
+		goto ret;
+
+	return pte;
+ret:
+	return NULL;
+}
+
+
 static const char errata93_warning[] = 
 KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
 KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
@@ -293,6 +327,7 @@ static int vmalloc_fault(unsigned long address)
 
 static int page_fault_trace;
 int show_unhandled_signals = 1;
+unsigned e298_bug = 0;
 
 /*
  * This routine handles page faults.  It determines the address,
@@ -310,6 +345,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 	int write, fault;
 	unsigned long flags;
 	siginfo_t info;
+	pte_t *pte;
 
 	tsk = current;
 	mm = tsk->mm;
@@ -318,6 +354,45 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 	/* get the address */
 	address = read_cr2();
 
+	if (!e298_bug)
+		goto emul_ready;
+
+	pte = get_pte(address);
+	if (pte != NULL) {
+
+		pte_t old_entry = *pte;
+		pte_t entry = old_entry;
+
+		if (!(error_code & PF_PROT) &&
+				(pte_val(entry) & _PAGE_E_PRESENT) &&
+				!(pte_val(entry) & _PAGE_PRESENT) &&
+				!(pte_val(entry) & _PAGE_E_ACCESSED)) {
+
+			entry = __pte(pte_val(entry) | _PAGE_E_ACCESSED);
+			entry = __pte(pte_val(entry) | _PAGE_PRESENT);
+			/* avoid possible race condition by using cmpxchg instead of set_pte */
+			cmpxchg(&pte_val(*pte), pte_val(old_entry), pte_val(entry));
+
+			return;
+		}
+
+		if ( (error_code & PF_WRITE) &&
+				(pte_val(entry) & _PAGE_E_PRESENT) &&
+				(pte_val(entry) & _PAGE_E_RW) &&
+				!(pte_val(entry) & _PAGE_RW) &&
+				!(pte_val(entry) & _PAGE_E_DIRTY)) {
+
+			entry = __pte(pte_val(entry) | _PAGE_E_DIRTY);
+			entry = __pte(pte_val(entry) | _PAGE_RW);
+			/* avoid possible race condition by using cmpxchg instead of set_pte */
+			cmpxchg(&pte_val(*pte), pte_val(old_entry), pte_val(entry));
+
+			return;
+		}
+	}
+
+emul_ready:
+
 	info.si_code = SEGV_MAPERR;
 
 
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index e2d6bad..ca421dd 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -268,7 +268,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
 		if (pmd_val(*pmd))
 			continue;
 
-		entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
+		entry = address|_PAGE_GLOBAL|__PAGE_KERNEL_LARGE;
 		entry &= __supported_pte_mask;
 		set_pmd(pmd, __pmd(entry));
 	}
diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c
index 6cac90a..9a67fc6 100644
--- a/arch/x86_64/mm/ioremap.c
+++ b/arch/x86_64/mm/ioremap.c
@@ -103,8 +103,7 @@ void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned l
 	}
 #endif
 
-	pgprot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_GLOBAL
-			  | _PAGE_DIRTY | _PAGE_ACCESSED | flags);
+	pgprot = __pgprot(__PAGE_KERNEL_EXEC | _PAGE_GLOBAL | flags);
 	/*
 	 * Mappings have to be page-aligned
 	 */
diff --git a/arch/x86_64/vdso/vdso.lds.S b/arch/x86_64/vdso/vdso.lds.S
index b9a60e6..4bf9720 100644
--- a/arch/x86_64/vdso/vdso.lds.S
+++ b/arch/x86_64/vdso/vdso.lds.S
@@ -28,7 +28,7 @@ SECTIONS
 
   .text           : { *(.text) }		:text
   .text.ptr       : { *(.text.ptr) }		:text
-  . = VDSO_PRELINK + 0x900;
+  . = VDSO_PRELINK + 0x908;
   .data           : { *(.data) }		:text
   .bss            : { *(.bss) }			:text
 
diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index 57dd6b3..f69d82c 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -13,6 +13,8 @@
 #include <linux/threads.h>
 #include <asm/pda.h>
 
+extern unsigned e298_bug;
+
 extern pud_t level3_kernel_pgt[512];
 extern pud_t level3_ident_pgt[512];
 extern pmd_t level2_kernel_pgt[512];
@@ -71,10 +73,6 @@ extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
 #define pgd_none(x)	(!pgd_val(x))
 #define pud_none(x)	(!pud_val(x))
 
-static inline void set_pte(pte_t *dst, pte_t val)
-{
-	pte_val(*dst) = pte_val(val);
-} 
 #define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval)
 
 static inline void set_pmd(pmd_t *dst, pmd_t val)
@@ -150,6 +148,10 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long
 #define _PAGE_BIT_DIRTY		6
 #define _PAGE_BIT_PSE		7	/* 4 MB (or 2MB) page */
 #define _PAGE_BIT_GLOBAL	8	/* Global TLB entry PPro+ */
+#define _PAGE_BIT_E_RW          9
+#define _PAGE_BIT_E_DIRTY      10
+#define _PAGE_BIT_E_ACCESSED   61
+#define _PAGE_BIT_E_PRESENT    62
 #define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
 
 #define _PAGE_PRESENT	0x001
@@ -162,35 +164,33 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long
 #define _PAGE_PSE	0x080	/* 2MB page */
 #define _PAGE_FILE	0x040	/* nonlinear file mapping, saved PTE; unset:swap */
 #define _PAGE_GLOBAL	0x100	/* Global TLB entry */
+#define _PAGE_E_RW      0x200
+#define _PAGE_E_DIRTY   0x400
 
 #define _PAGE_PROTNONE	0x080	/* If not present */
+#define _PAGE_E_PRESENT  (_AC(1,UL)<<_PAGE_BIT_E_PRESENT)
+#define _PAGE_E_ACCESSED (_AC(1,UL)<<_PAGE_BIT_E_ACCESSED)
 #define _PAGE_NX        (_AC(1,UL)<<_PAGE_BIT_NX)
 
 #define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
 #define _KERNPG_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
 
-#define _PAGE_CHG_MASK	(PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _PAGE_CHG_MASK	(PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_E_ACCESSED | _PAGE_E_DIRTY)
 
 #define PAGE_NONE	__pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
-#define PAGE_SHARED	__pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
-#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
-#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
+#define PAGE_SHARED      __pgprot(_PAGE_E_PRESENT | _PAGE_E_RW | _PAGE_USER | _PAGE_E_ACCESSED | _PAGE_NX | _PAGE_PRESENT | _PAGE_ACCESSED)
+#define PAGE_SHARED_EXEC __pgprot(_PAGE_E_PRESENT | _PAGE_E_RW | _PAGE_USER | _PAGE_E_ACCESSED | _PAGE_PRESENT | _PAGE_ACCESSED)
+#define PAGE_COPY_NOEXEC __pgprot(_PAGE_E_PRESENT | _PAGE_USER | _PAGE_E_ACCESSED | _PAGE_NX | _PAGE_PRESENT | _PAGE_ACCESSED)
 #define PAGE_COPY PAGE_COPY_NOEXEC
-#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
-#define PAGE_READONLY	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
-#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
-#define __PAGE_KERNEL \
-	(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
-#define __PAGE_KERNEL_EXEC \
-	(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
-#define __PAGE_KERNEL_NOCACHE \
-	(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX)
-#define __PAGE_KERNEL_RO \
-	(_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
-#define __PAGE_KERNEL_VSYSCALL \
-	(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
-#define __PAGE_KERNEL_VSYSCALL_NOCACHE \
-	(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD)
+#define PAGE_COPY_EXEC __pgprot(_PAGE_E_PRESENT | _PAGE_USER | _PAGE_E_ACCESSED | _PAGE_PRESENT | _PAGE_ACCESSED)
+#define PAGE_READONLY   __pgprot(_PAGE_E_PRESENT | _PAGE_USER | _PAGE_E_ACCESSED | _PAGE_NX | _PAGE_PRESENT | _PAGE_ACCESSED)
+#define PAGE_READONLY_EXEC __pgprot(_PAGE_E_PRESENT | _PAGE_USER | _PAGE_E_ACCESSED | _PAGE_PRESENT | _PAGE_ACCESSED)
+#define __PAGE_KERNEL                  (_PAGE_E_PRESENT | _PAGE_E_RW | _PAGE_E_DIRTY | _PAGE_E_ACCESSED | _PAGE_NX | _PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define __PAGE_KERNEL_EXEC             (_PAGE_E_PRESENT | _PAGE_E_RW | _PAGE_E_DIRTY | _PAGE_E_ACCESSED | _PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define __PAGE_KERNEL_NOCACHE          (_PAGE_E_PRESENT | _PAGE_E_RW | _PAGE_E_DIRTY | _PAGE_PCD | _PAGE_E_ACCESSED | _PAGE_NX | _PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define __PAGE_KERNEL_RO               (_PAGE_E_PRESENT | _PAGE_E_DIRTY | _PAGE_E_ACCESSED | _PAGE_NX | _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define __PAGE_KERNEL_VSYSCALL         (_PAGE_E_PRESENT | _PAGE_USER | _PAGE_E_ACCESSED | _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define __PAGE_KERNEL_VSYSCALL_NOCACHE (_PAGE_E_PRESENT | _PAGE_USER | _PAGE_E_ACCESSED | _PAGE_PCD | _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_DIRTY)
 #define __PAGE_KERNEL_LARGE \
 	(__PAGE_KERNEL | _PAGE_PSE)
 #define __PAGE_KERNEL_LARGE_EXEC \
@@ -228,6 +228,34 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long
 
 #ifndef __ASSEMBLY__
 
+static inline void set_pte(pte_t *dst, pte_t pte)
+{
+	if (!(pte_val(pte) & _PAGE_E_PRESENT)) {
+		pte_val(pte) &= ~_PAGE_PRESENT;
+		goto set;
+	}
+	if (e298_bug) {
+		pte_val(pte) |= (_PAGE_ACCESSED | _PAGE_DIRTY);
+		if (!(pte_val(pte) & _PAGE_E_DIRTY) || !(pte_val(pte) & _PAGE_E_RW))
+			pte_val(pte) &= ~_PAGE_RW;
+		if ((pte_val(pte) & _PAGE_E_DIRTY) && (pte_val(pte) & _PAGE_E_RW))
+			pte_val(pte) |= _PAGE_RW;
+		if (!(pte_val(pte) & _PAGE_E_ACCESSED) || !(pte_val(pte) & _PAGE_E_PRESENT))
+			pte_val(pte) &= ~_PAGE_PRESENT;
+		if ((pte_val(pte) & _PAGE_E_ACCESSED) && (pte_val(pte) & _PAGE_E_PRESENT))
+			pte_val(pte) |= _PAGE_PRESENT;
+	} else {
+		if (!(pte_val(pte) & _PAGE_E_RW)) pte_val(pte) &= ~_PAGE_RW;
+		else pte_val(pte) |= _PAGE_RW;
+
+		if (!(pte_val(pte) & _PAGE_E_PRESENT)) pte_val(pte) &= ~_PAGE_PRESENT;
+		else pte_val(pte) |= _PAGE_PRESENT;
+	}
+set:
+	pte_val(*dst) = pte_val(pte);
+
+}
+
 static inline unsigned long pgd_bad(pgd_t pgd)
 {
 	return pgd_val(pgd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
@@ -244,11 +272,10 @@ static inline unsigned long pmd_bad(pmd_t pmd)
 }
 
 #define pte_none(x)	(!pte_val(x))
-#define pte_present(x)	(pte_val(x) & (_PAGE_PRESENT | _PAGE_PROTNONE))
+#define pte_present(x)	(pte_val(x) & (_PAGE_E_PRESENT | _PAGE_PROTNONE))
 #define pte_clear(mm,addr,xp)	do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
 
-#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))	/* FIXME: is this
-						   right? */
+#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))	/* FIXME: is this right? */
 #define pte_page(x)	pfn_to_page(pte_pfn(x))
 #define pte_pfn(x)  ((pte_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
 
@@ -266,19 +293,19 @@ static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
  * Undefined behaviour if not..
  */
 #define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT)
-static inline int pte_dirty(pte_t pte)		{ return pte_val(pte) & _PAGE_DIRTY; }
-static inline int pte_young(pte_t pte)		{ return pte_val(pte) & _PAGE_ACCESSED; }
+static inline int pte_dirty(pte_t pte)          { return pte_val(pte) & (e298_bug ? _PAGE_E_DIRTY : _PAGE_DIRTY); }
+static inline int pte_young(pte_t pte)          { return pte_val(pte) & (e298_bug ? _PAGE_E_ACCESSED : _PAGE_ACCESSED); }
 static inline int pte_write(pte_t pte)		{ return pte_val(pte) & _PAGE_RW; }
 static inline int pte_file(pte_t pte)		{ return pte_val(pte) & _PAGE_FILE; }
 static inline int pte_huge(pte_t pte)		{ return pte_val(pte) & _PAGE_PSE; }
 
-static inline pte_t pte_mkclean(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_DIRTY)); return pte; }
-static inline pte_t pte_mkold(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_ACCESSED)); return pte; }
-static inline pte_t pte_wrprotect(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_RW)); return pte; }
+static inline pte_t pte_mkclean(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) & ~(_PAGE_DIRTY|_PAGE_E_DIRTY))); return pte; }
+static inline pte_t pte_mkold(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) & ~(_PAGE_ACCESSED|_PAGE_E_ACCESSED))); return pte; }
+static inline pte_t pte_wrprotect(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) & ~(_PAGE_RW|_PAGE_E_RW))); return pte; }
 static inline pte_t pte_mkexec(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_NX)); return pte; }
-static inline pte_t pte_mkdirty(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY)); return pte; }
-static inline pte_t pte_mkyoung(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; }
-static inline pte_t pte_mkwrite(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | _PAGE_RW)); return pte; }
+static inline pte_t pte_mkdirty(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | (_PAGE_DIRTY|_PAGE_E_DIRTY))); return pte; }
+static inline pte_t pte_mkyoung(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | (_PAGE_ACCESSED|_PAGE_E_ACCESSED))); return pte; }
+static inline pte_t pte_mkwrite(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | (_PAGE_RW|_PAGE_E_RW))); return pte; }
 static inline pte_t pte_mkhuge(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | _PAGE_PSE)); return pte; }
 static inline pte_t pte_clrhuge(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_PSE)); return pte; }
 
@@ -286,14 +313,30 @@ struct vm_area_struct;
 
 static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
 {
+	int oldbit;
+	pte_t old_entry, entry;
+
 	if (!pte_young(*ptep))
 		return 0;
-	return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte);
+again:
+	/* retry loop to fix possible race conditions with emulation code */
+	old_entry = *ptep;
+	entry = old_entry;
+	oldbit = test_and_clear_bit(e298_bug ? _PAGE_BIT_E_ACCESSED : _PAGE_BIT_ACCESSED, &entry);
+	if (e298_bug)
+		clear_bit(_PAGE_BIT_PRESENT, &entry);
+	if (cmpxchg(&pte_val(*ptep), pte_val(old_entry), pte_val(entry)) != pte_val(old_entry))
+		goto again;
+
+	return oldbit;
 }
 
 static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
-	clear_bit(_PAGE_BIT_RW, &ptep->pte);
+	__asm__ __volatile__ (LOCK_PREFIX "andq %1, %0  \n"
+			:
+			: "m" (*ptep), "i" (~((_PAGE_RW | _PAGE_E_RW)))
+			: "memory");
 }
 
 /*
@@ -392,9 +435,10 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 })
 
 /* Encode and de-code a swap entry */
+#define __SWAP_ENTRY_MASK (~(_PAGE_NX | _PAGE_E_PRESENT))
 #define __swp_type(x)			(((x).val >> 1) & 0x3f)
 #define __swp_offset(x)			((x).val >> 8)
-#define __swp_entry(type, offset)	((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
+#define __swp_entry(type, offset)       ((swp_entry_t) { (__SWAP_ENTRY_MASK & (((type) << 1) | ((offset) << 8))) })
 #define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val(pte) })
 #define __swp_entry_to_pte(x)		((pte_t) { (x).val })
 

