Blame SOURCES/mcelog-haswell-support.patch

09c1d0
The patches were in the process of being committed to Andi's upstream mcelog
09c1d0
tree when they were applied to the RHEL source.  The patch subjects are
09c1d0
09c1d0
	Add better decoding support for Haswell server processors
09c1d0
	More compact data structures for reporting SNB/IVB memory controller errors
09c1d0
09c1d0
and were provided early by Tony Luck @ Intel.
09c1d0
09c1d0
diff -urNp mcelog-d2e13bf0.orig/haswell.c mcelog-d2e13bf0/haswell.c
09c1d0
--- mcelog-d2e13bf0.orig/haswell.c	1969-12-31 19:00:00.000000000 -0500
09c1d0
+++ mcelog-d2e13bf0/haswell.c	2014-09-08 09:59:52.998327718 -0400
09c1d0
@@ -0,0 +1,150 @@
09c1d0
+/* Copyright (C) 2013 Intel Corporation
09c1d0
+   Decode Intel Haswell specific machine check errors.
09c1d0
+
09c1d0
+   mcelog is free software; you can redistribute it and/or
09c1d0
+   modify it under the terms of the GNU General Public
09c1d0
+   License as published by the Free Software Foundation; version
09c1d0
+   2.
09c1d0
+
09c1d0
+   mcelog is distributed in the hope that it will be useful,
09c1d0
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
09c1d0
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
09c1d0
+   General Public License for more details.
09c1d0
+
09c1d0
+   You should find a copy of v2 of the GNU General Public License somewhere
09c1d0
+   on your Linux system; if not, write to the Free Software Foundation,
09c1d0
+   Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
09c1d0
+
09c1d0
+   Author: Tony Luck
09c1d0
+*/
09c1d0
+
09c1d0
+#include "mcelog.h"
09c1d0
+#include "bitfield.h"
09c1d0
+#include "haswell.h"
09c1d0
+#include "memdb.h"
09c1d0
+
09c1d0
+/* See IA32 SDM Vol3B Table 16-20 */
09c1d0
+
09c1d0
+static char *pcu_1[] = {
09c1d0
+	[0x00] = "No Error",
09c1d0
+	[0x09] = "MC_MESSAGE_CHANNEL_TIMEOUT",
09c1d0
+	[0x0D] = "MC_IMC_FORCE_SR_S3_TIMEOUT",
09c1d0
+	[0x0E] = "MC_CPD_UNCPD_SD_TIMEOUT",
09c1d0
+	[0x13] = "MC_DMI_TRAINING_TIMEOUT",
09c1d0
+	[0x15] = "MC_DMI_CPU_RESET_ACK_TIMEOUT",
09c1d0
+	[0x1E] = "MC_VR_ICC_MAX_LT_FUSED_ICC_MAX",
09c1d0
+	[0x25] = "MC_SVID_COMMAN_TIMEOUT",
09c1d0
+	[0x29] = "MC_VR_VOUT_MAC_LT_FUSED_SVID",
09c1d0
+	[0x2B] = "MC_PKGC_WATCHDOG_HANG_CBZ_DOWN",
09c1d0
+	[0x2C] = "MC_PKGC_WATCHDOG_HANG_CBZ_UP",
09c1d0
+	[0x39] = "MC_PKGC_WATCHDOG_HANG_C3_UP_SF",
09c1d0
+	[0x44] = "MC_CRITICAL_VR_FAILED",
09c1d0
+	[0x45] = "MC_ICC_MAX_NOTSUPPORTED",
09c1d0
+	[0x46] = "MC_VID_RAMP_DOWN_FAILED",
09c1d0
+	[0x47] = "MC_EXCL_MODE_NO_PMREQ_CMP",
09c1d0
+	[0x48] = "MC_SVID_READ_REG_ICC_MAX_FAILED",
09c1d0
+	[0x49] = "MC_SVID_WRITE_REG_VOUT_MAX_FAILED",
09c1d0
+	[0x4B] = "MC_BOOT_VID_TIMEOUT_DRAM_0",
09c1d0
+	[0x4C] = "MC_BOOT_VID_TIMEOUT_DRAM_1",
09c1d0
+	[0x4D] = "MC_BOOT_VID_TIMEOUT_DRAM_2",
09c1d0
+	[0x4E] = "MC_BOOT_VID_TIMEOUT_DRAM_3",
09c1d0
+	[0x4F] = "MC_SVID_COMMAND_ERROR",
09c1d0
+	[0x52] = "MC_FIVR_CATAS_OVERVOL_FAULT",
09c1d0
+	[0x53] = "MC_FIVR_CATAS_OVERCUR_FAULT",
09c1d0
+	[0x57] = "MC_SVID_PKGC_REQUEST_FAILED",
09c1d0
+	[0x58] = "MC_SVID_IMON_REQUEST_FAILED",
09c1d0
+	[0x59] = "MC_SVID_ALERT_REQUEST_FAILED",
09c1d0
+	[0x60] = "MC_INVALID_PKGS_REQ_PCH",
09c1d0
+	[0x61] = "MC_INVALID_PKGS_REQ_QPI",
09c1d0
+	[0x62] = "MC_INVALID_PKGS_RSP_QPI",
09c1d0
+	[0x63] = "MC_INVALID_PKGS_RSP_PCH",
09c1d0
+	[0x64] = "MC_INVALID_PKG_STATE_CONFIG",
09c1d0
+	[0x67] = "MC_HA_IMC_RW_BLOCK_ACK_TIMEOUT",
09c1d0
+	[0x68] = "MC_IMC_RW_SMBUS_TIMEOUT",
09c1d0
+	[0x69] = "MC_HA_FAILSTS_CHANGE_DETECTED",
09c1d0
+	[0x6A] = "MC_MSGCH_PMREQ_CMP_TIMEOUT",
09c1d0
+	[0x70] = "MC_WATCHDOG_TIMEOUT_PKGC_SLAVE",
09c1d0
+	[0x71] = "MC_WATCHDOG_TIMEOUT_PKGC_MASTER",
09c1d0
+	[0x72] = "MC_WATCHDOG_TIMEOUT_PKGS_MASTER",
09c1d0
+	[0x7C] = "MC_BIOS_RST_CPL_INVALID_SEQ",
09c1d0
+	[0x7D] = "MC_MORE_THAN_ONE_TXT_AGENT",
09c1d0
+	[0x81] = "MC_RECOVERABLE_DIE_THERMAL_TOO_HOT"
09c1d0
+};
09c1d0
+
09c1d0
+static struct field pcu_mc4[] = {
09c1d0
+	FIELD(24, pcu_1),
09c1d0
+	{}
09c1d0
+};
09c1d0
+
09c1d0
+/* See IA32 SDM Vol3B Table 16-21 */
09c1d0
+
09c1d0
+static char *qpi[] = {
09c1d0
+	[0x02] = "Intel QPI physical layer detected drift buffer alarm",
09c1d0
+	[0x03] = "Intel QPI physical layer detected latency buffer rollover",
09c1d0
+	[0x10] = "Intel QPI link layer detected control error from R3QPI",
09c1d0
+	[0x11] = "Rx entered LLR abort state on CRC error",
09c1d0
+	[0x12] = "Unsupported or undefined packet",
09c1d0
+	[0x13] = "Intel QPI link layer control error",
09c1d0
+	[0x15] = "RBT used un-initialized value",
09c1d0
+	[0x20] = "Intel QPI physical layer detected a QPI in-band reset but aborted initialization",
09c1d0
+	[0x21] = "Link failover data self healing",
09c1d0
+	[0x22] = "Phy detected in-band reset (no width change)",
09c1d0
+	[0x23] = "Link failover clock failover",
09c1d0
+	[0x30] = "Rx detected CRC error - successful LLR after Phy re-init",
09c1d0
+	[0x31] = "Rx detected CRC error - successful LLR without Phy re-init",
09c1d0
+};
09c1d0
+
09c1d0
+static struct field qpi_mc[] = {
09c1d0
+	FIELD(16, qpi),
09c1d0
+	{}
09c1d0
+};
09c1d0
+
09c1d0
+/* See IA32 SDM Vol3B Table 16-22 */
09c1d0
+
09c1d0
+static struct field memctrl_mc9[] = {
09c1d0
+	SBITFIELD(16, "DDR3 address parity error"),
09c1d0
+	SBITFIELD(17, "Uncorrected HA write data error"),
09c1d0
+	SBITFIELD(18, "Uncorrected HA data byte enable error"),
09c1d0
+	SBITFIELD(19, "Corrected patrol scrub error"),
09c1d0
+	SBITFIELD(20, "Uncorrected patrol scrub error"),
09c1d0
+	SBITFIELD(21, "Corrected spare error"),
09c1d0
+	SBITFIELD(22, "Uncorrected spare error"),
09c1d0
+	SBITFIELD(23, "Corrected memory read error"),
09c1d0
+	SBITFIELD(24, "iMC write data buffer parity error"),
09c1d0
+	SBITFIELD(25, "DDR4 command address parity error"),
09c1d0
+	{}
09c1d0
+};
09c1d0
+
09c1d0
+void hsw_decode_model(int cputype, int bank, u64 status, u64 misc)
09c1d0
+{
09c1d0
+	switch (bank) {
09c1d0
+	case 4:
09c1d0
+		Wprintf("PCU: ");
09c1d0
+		switch (EXTRACT(status, 0, 15) & ~(1ull << 12)) {
09c1d0
+		case 0x402: case 0x403:
09c1d0
+			Wprintf("Internal errors ");
09c1d0
+			break;
09c1d0
+		case 0x406:
09c1d0
+			Wprintf("Intel TXT errors ");
09c1d0
+			break;
09c1d0
+		case 0x407:
09c1d0
+			Wprintf("Other UBOX Internal errors ");
09c1d0
+			break;
09c1d0
+		}
09c1d0
+		if (EXTRACT(status, 16, 19))
09c1d0
+			Wprintf("PCU internal error ");
09c1d0
+		decode_bitfield(status, pcu_mc4);
09c1d0
+		break;
09c1d0
+	case 5:
09c1d0
+	case 20:
09c1d0
+	case 21:
09c1d0
+		Wprintf("QPI: ");
09c1d0
+		decode_bitfield(status, qpi_mc);
09c1d0
+		break;
09c1d0
+	case 9: case 10: case 11: case 12:
09c1d0
+	case 13: case 14: case 15: case 16:
09c1d0
+		Wprintf("MemCtrl: ");
09c1d0
+		decode_bitfield(status, memctrl_mc9);
09c1d0
+		break;
09c1d0
+	}
09c1d0
+}
09c1d0
diff -urNp mcelog-d2e13bf0.orig/haswell.h mcelog-d2e13bf0/haswell.h
09c1d0
--- mcelog-d2e13bf0.orig/haswell.h	1969-12-31 19:00:00.000000000 -0500
09c1d0
+++ mcelog-d2e13bf0/haswell.h	2014-09-08 09:59:52.998327718 -0400
09c1d0
@@ -0,0 +1,2 @@
09c1d0
+void hsw_decode_model(int cputype, int bank, u64 status, u64 misc);
09c1d0
+void haswell_ep_memerr_misc(struct mce *m, int *channel, int *dimm);
09c1d0
diff -urNp mcelog-d2e13bf0.orig/intel.c mcelog-d2e13bf0/intel.c
09c1d0
--- mcelog-d2e13bf0.orig/intel.c	2014-09-08 09:59:39.622699389 -0400
09c1d0
+++ mcelog-d2e13bf0/intel.c	2014-09-08 09:59:52.998327718 -0400
09c1d0
@@ -24,6 +24,7 @@
09c1d0
 #include "page.h"
09c1d0
 #include "sandy-bridge.h"
09c1d0
 #include "ivy-bridge.h"
09c1d0
+#include "haswell.h"
09c1d0
 #include "xeon75xx.h"
09c1d0
 
09c1d0
 int memory_error_support;
09c1d0
@@ -33,7 +34,7 @@ void intel_cpu_init(enum cputype cpu)
09c1d0
 	if (cpu == CPU_NEHALEM || cpu == CPU_XEON75XX || cpu == CPU_INTEL ||
09c1d0
 	    cpu == CPU_SANDY_BRIDGE || cpu == CPU_SANDY_BRIDGE_EP ||
09c1d0
 	    cpu == CPU_IVY_BRIDGE || cpu == CPU_IVY_BRIDGE_EPEX ||
09c1d0
-	    cpu == CPU_HASWELL)
09c1d0
+	    cpu == CPU_HASWELL || cpu == CPU_HASWELL_EPEX)
09c1d0
 		memory_error_support = 1;
09c1d0
 }
09c1d0
 
09c1d0
@@ -67,9 +68,10 @@ enum cputype select_intel_cputype(int fa
09c1d0
 			return CPU_IVY_BRIDGE;
09c1d0
 		else if (model == 0x3e)
09c1d0
 			return CPU_IVY_BRIDGE_EPEX;
09c1d0
-		else if (model == 0x3c || model == 0x3f || model == 0x45 ||
09c1d0
-			 model == 0x46)
09c1d0
+		else if (model == 0x3c || model == 0x45 || model == 0x46)
09c1d0
 			return CPU_HASWELL;
09c1d0
+		else if (model == 0x3f)
09c1d0
+			return CPU_HASWELL_EPEX;
09c1d0
 		if (model > 0x1a) {
09c1d0
 			Eprintf("Family 6 Model %x CPU: only decoding architectural errors\n",
09c1d0
 				model);
09c1d0
diff -urNp mcelog-d2e13bf0.orig/intel.h mcelog-d2e13bf0/intel.h
09c1d0
--- mcelog-d2e13bf0.orig/intel.h	2014-09-08 09:59:39.621699344 -0400
09c1d0
+++ mcelog-d2e13bf0/intel.h	2014-09-08 09:59:52.998327718 -0400
09c1d0
@@ -18,5 +18,6 @@ extern int memory_error_support;
09c1d0
 	case CPU_SANDY_BRIDGE: \
09c1d0
 	case CPU_IVY_BRIDGE: \
09c1d0
 	case CPU_IVY_BRIDGE_EPEX: \
09c1d0
-	case CPU_HASWELL
09c1d0
+	case CPU_HASWELL: \
09c1d0
+	case CPU_HASWELL_EPEX
09c1d0
 
09c1d0
diff -urNp mcelog-d2e13bf0.orig/ivy-bridge.c mcelog-d2e13bf0/ivy-bridge.c
09c1d0
--- mcelog-d2e13bf0.orig/ivy-bridge.c	2014-09-08 09:59:39.621699344 -0400
09c1d0
+++ mcelog-d2e13bf0/ivy-bridge.c	2014-09-08 09:59:56.033470497 -0400
09c1d0
@@ -68,20 +68,16 @@ static struct field pcu_mc4[] = {
09c1d0
 
09c1d0
 /* See IA32 SDM Vol3B Table 16-18 */
09c1d0
 
09c1d0
-static char *memctrl_1[] = {
09c1d0
-	[0x001] = "Address parity error",
09c1d0
-	[0x002] = "HA Wrt buffer Data parity error",
09c1d0
-	[0x004] = "HA Wrt byte enable parity error",
09c1d0
-	[0x008] = "Corrected patrol scrub error",
09c1d0
-	[0x010] = "Uncorrected patrol scrub error",
09c1d0
-	[0x020] = "Corrected spare error",
09c1d0
-	[0x040] = "Uncorrected spare error",
09c1d0
-	[0x080] = "Corrected memory read error",
09c1d0
-	[0x100] = "iMC, WDB, parity errors",
09c1d0
-};
09c1d0
-
09c1d0
 static struct field memctrl_mc9[] = {
09c1d0
-	FIELD(16, memctrl_1),
09c1d0
+	SBITFIELD(16, "Address parity error"),
09c1d0
+	SBITFIELD(17, "HA Wrt buffer Data parity error"),
09c1d0
+	SBITFIELD(18, "HA Wrt byte enable parity error"),
09c1d0
+	SBITFIELD(19, "Corrected patrol scrub error"),
09c1d0
+	SBITFIELD(20, "Uncorrected patrol scrub error"),
09c1d0
+	SBITFIELD(21, "Corrected spare error"),
09c1d0
+	SBITFIELD(22, "Uncorrected spare error"),
09c1d0
+	SBITFIELD(23, "Corrected memory read error"),
09c1d0
+	SBITFIELD(24, "iMC, WDB, parity errors"),
09c1d0
 	{}
09c1d0
 };
09c1d0
 
09c1d0
diff -urNp mcelog-d2e13bf0.orig/Makefile mcelog-d2e13bf0/Makefile
09c1d0
--- mcelog-d2e13bf0.orig/Makefile	2014-09-08 09:59:39.610698703 -0400
09c1d0
+++ mcelog-d2e13bf0/Makefile	2014-09-08 09:59:52.998327718 -0400
09c1d0
@@ -32,7 +32,7 @@ OBJ := p4.o k8.o mcelog.o dmi.o tsc.o co
09c1d0
        nehalem.o dunnington.o tulsa.o config.o memutil.o msg.o   \
09c1d0
        eventloop.o leaky-bucket.o memdb.o server.o trigger.o 	 \
09c1d0
        client.o cache.o sysfs.o yellow.o page.o rbtree.o 	 \
09c1d0
-       xeon75xx.o sandy-bridge.o ivy-bridge.o msr.o
09c1d0
+       xeon75xx.o sandy-bridge.o ivy-bridge.o haswell.o msr.o
09c1d0
 DISKDB_OBJ := diskdb.o dimm.o db.o
09c1d0
 CLEAN := mcelog dmi tsc dbquery .depend .depend.X dbquery.o ${DISKDB_OBJ}
09c1d0
 DOC := mce.pdf
09c1d0
diff -urNp mcelog-d2e13bf0.orig/mcelog.c mcelog-d2e13bf0/mcelog.c
09c1d0
--- mcelog-d2e13bf0.orig/mcelog.c	2014-09-08 09:59:39.622699389 -0400
09c1d0
+++ mcelog-d2e13bf0/mcelog.c	2014-09-08 09:59:52.999327768 -0400
09c1d0
@@ -228,6 +228,7 @@ static char *cputype_name[] = {
09c1d0
 	[CPU_IVY_BRIDGE] = "Ivy Bridge", /* Fill in better name */
09c1d0
 	[CPU_IVY_BRIDGE_EPEX] = "Ivy Bridge EP/EX", /* Fill in better name */
09c1d0
 	[CPU_HASWELL] = "Haswell", /* Fill in better name */
09c1d0
+	[CPU_HASWELL_EPEX] = "Haswell EP/EX", /* Fill in better name */
09c1d0
 };
09c1d0
 
09c1d0
 static struct config_choice cpu_choices[] = {
09c1d0
@@ -264,6 +265,8 @@ static struct config_choice cpu_choices[
09c1d0
 	{ "ivybridge-ep", CPU_IVY_BRIDGE_EPEX }, /* Fill in better name */
09c1d0
 	{ "ivybridge-ex", CPU_IVY_BRIDGE_EPEX }, /* Fill in better name */
09c1d0
 	{ "haswell", CPU_HASWELL }, /* Fill in better name */
09c1d0
+	{ "haswell-ep", CPU_HASWELL_EPEX }, /* Fill in better name */
09c1d0
+	{ "haswell-ex", CPU_HASWELL_EPEX }, /* Fill in better name */
09c1d0
 	{}
09c1d0
 };
09c1d0
 
09c1d0
@@ -424,7 +427,8 @@ static void dump_mce(struct mce *m, unsi
09c1d0
 			fam,
09c1d0
 			mod);
09c1d0
 	}
09c1d0
-	if (cputype != CPU_SANDY_BRIDGE_EP && cputype != CPU_IVY_BRIDGE_EPEX)
09c1d0
+	if (cputype != CPU_SANDY_BRIDGE_EP && cputype != CPU_IVY_BRIDGE_EPEX &&
09c1d0
+	    cputype != CPU_HASWELL_EPEX)
09c1d0
 		resolveaddr(m->addr);
09c1d0
 	if (!ascii_mode && ismemerr && (m->status & MCI_STATUS_ADDRV)) {
09c1d0
 		diskdb_resolve_addr(m->addr);
09c1d0
diff -urNp mcelog-d2e13bf0.orig/mcelog.h mcelog-d2e13bf0/mcelog.h
09c1d0
--- mcelog-d2e13bf0.orig/mcelog.h	2014-09-08 09:59:39.621699344 -0400
09c1d0
+++ mcelog-d2e13bf0/mcelog.h	2014-09-08 09:59:52.999327768 -0400
09c1d0
@@ -118,6 +118,7 @@ enum cputype {
09c1d0
 	CPU_IVY_BRIDGE, 
09c1d0
 	CPU_IVY_BRIDGE_EPEX, 
09c1d0
 	CPU_HASWELL,
09c1d0
+	CPU_HASWELL_EPEX,
09c1d0
 };
09c1d0
 
09c1d0
 enum option_ranges {
09c1d0
diff -urNp mcelog-d2e13bf0.orig/p4.c mcelog-d2e13bf0/p4.c
09c1d0
--- mcelog-d2e13bf0.orig/p4.c	2014-09-08 09:59:39.621699344 -0400
09c1d0
+++ mcelog-d2e13bf0/p4.c	2014-09-08 09:59:52.999327768 -0400
09c1d0
@@ -33,6 +33,7 @@
09c1d0
 #include "bitfield.h"
09c1d0
 #include "sandy-bridge.h"
09c1d0
 #include "ivy-bridge.h"
09c1d0
+#include "haswell.h"
09c1d0
 
09c1d0
 /* decode mce for P4/Xeon and Core2 family */
09c1d0
 
09c1d0
@@ -360,6 +361,9 @@ void decode_intel_mc(struct mce *log, in
09c1d0
 	case CPU_IVY_BRIDGE_EPEX:
09c1d0
 		ivb_decode_model(cputype, log->bank, log->status, log->misc);
09c1d0
 		break;
09c1d0
+	case CPU_HASWELL_EPEX:
09c1d0
+		hsw_decode_model(cputype, log->bank, log->status, log->misc);
09c1d0
+		break;
09c1d0
 	}
09c1d0
 
09c1d0
 	/* IO MCA - reported as bus/interconnect with specific PP,T,RRRR,II,LL values
09c1d0
diff -urNp mcelog-d2e13bf0.orig/sandy-bridge.c mcelog-d2e13bf0/sandy-bridge.c
09c1d0
--- mcelog-d2e13bf0.orig/sandy-bridge.c	2014-09-08 09:59:39.616699077 -0400
09c1d0
+++ mcelog-d2e13bf0/sandy-bridge.c	2014-09-08 09:59:56.033470497 -0400
09c1d0
@@ -63,18 +63,14 @@ static struct field pcu_mc4[] = {
09c1d0
 	{}
09c1d0
 };
09c1d0
 
09c1d0
-static char *memctrl_1[] = {
09c1d0
-	[0x001] = "Address parity error",
09c1d0
-	[0x002] = "HA Wrt buffer Data parity error",
09c1d0
-	[0x004] = "HA Wrt byte enable parity error",
09c1d0
-	[0x008] = "Corrected patrol scrub error",
09c1d0
-	[0x010] = "Uncorrected patrol scrub error",
09c1d0
-	[0x020] = "Corrected spare error",
09c1d0
-	[0x040] = "Uncorrected spare error",
09c1d0
-};
09c1d0
-
09c1d0
 static struct field memctrl_mc8[] = {
09c1d0
-	FIELD(16, memctrl_1),
09c1d0
+	SBITFIELD(16, "Address parity error"),
09c1d0
+	SBITFIELD(17, "HA Wrt buffer Data parity error"),
09c1d0
+	SBITFIELD(18, "HA Wrt byte enable parity error"),
09c1d0
+	SBITFIELD(19, "Corrected patrol scrub error"),
09c1d0
+	SBITFIELD(20, "Uncorrected patrol scrub error"),
09c1d0
+	SBITFIELD(21, "Corrected spare error"),
09c1d0
+	SBITFIELD(22, "Uncorrected spare error"),
09c1d0
 	{}
09c1d0
 };
09c1d0