Blame SOURCES/mcelog-patch-6ed93e30f835.patch

09c1d0
From: Prarit Bhargava <prarit@redhat.com>
09c1d0
09c1d0
Subject: mcelog: Deduce channel number for Haswell/Broadwell/Skylake systems
09c1d0
09c1d0
commit 6ed93e30f83519b0ab71f8ecd156b8ff0b2912b6
09c1d0
Author: Tony Luck <tony.luck@intel.com>
09c1d0
Date:   Mon Sep 24 11:14:45 2018 -0700
09c1d0
09c1d0
    mcelog: Deduce channel number for Haswell/Broadwell/Skylake systems
09c1d0
    
09c1d0
    Ivy Bridge was the last system that gave us enough information
09c1d0
    to figure out the exact DIMM that is the source of a memory error.
09c1d0
    We gave up on DIMM logging at that point.
09c1d0
    
09c1d0
    But we can still figure out the socket, memory controller and channel.
09c1d0
    
09c1d0
    Signed-off-by: Tony Luck <tony.luck@intel.com>
09c1d0
    Signed-off-by: Andi Kleen <ak@linux.intel.com>
09c1d0
09c1d0
diff --git a/haswell.c b/haswell.c
09c1d0
index 892ebc7248e808248798f21506b54faca147db9b..4eccbeb21a281467495e024b376d81be96b2183e 100644
09c1d0
--- a/haswell.c
09c1d0
+++ b/haswell.c
09c1d0
@@ -148,3 +148,45 @@ void hsw_decode_model(int cputype, int bank, u64 status, u64 misc)
09c1d0
 		break;
09c1d0
 	}
09c1d0
 }
09c1d0
+
09c1d0
+/*
09c1d0
+ * There isn't enough information to identify the DIMM. But
09c1d0
+ * we can derive the channel from the bank number.
09c1d0
+ * There can be two memory controllers. We number the channels
09c1d0
+ * on the second controller: 4, 5, 6, 7
09c1d0
+ */
09c1d0
+void haswell_memerr_misc(struct mce *m, int *channel, int *dimm)
09c1d0
+{
09c1d0
+	u64 status = m->status;
09c1d0
+	unsigned	chan;
09c1d0
+
09c1d0
+	/* Check this is a memory error */
09c1d0
+	if (!test_prefix(7, status & 0xefff))
09c1d0
+		return;
09c1d0
+
09c1d0
+	chan = EXTRACT(status, 0, 3);
09c1d0
+	if (chan == 0xf)
09c1d0
+		return;
09c1d0
+
09c1d0
+	switch (m->bank) {
09c1d0
+	case 7:
09c1d0
+		/* Home agent 0 */
09c1d0
+		break;
09c1d0
+	case 8:
09c1d0
+		/* Home agent 1 */
09c1d0
+		chan += 4;
09c1d0
+		break;
09c1d0
+	case 9: case 10: case 11: case 12:
09c1d0
+		/* Memory controller 0 */
09c1d0
+		chan = m->bank - 9;
09c1d0
+		break;
09c1d0
+	case 13: case 14: case 15: case 16:
09c1d0
+		/* Memory controller 1 */
09c1d0
+		chan = (m->bank - 13) + 4;
09c1d0
+		break;
09c1d0
+	default:
09c1d0
+		return;
09c1d0
+	}
09c1d0
+
09c1d0
+	channel[0] = chan;
09c1d0
+}
09c1d0
diff --git a/haswell.h b/haswell.h
09c1d0
index ba3fb1c3c985aec0ac1a0a271dca3c3afd18874c..712c8eb66d50a1bf63a7dbd67382fe775b59d69b 100644
09c1d0
--- a/haswell.h
09c1d0
+++ b/haswell.h
09c1d0
@@ -1,2 +1,3 @@
09c1d0
 void hsw_decode_model(int cputype, int bank, u64 status, u64 misc);
09c1d0
 void haswell_ep_memerr_misc(struct mce *m, int *channel, int *dimm);
09c1d0
+void haswell_memerr_misc(struct mce *m, int *channel, int *dimm);
09c1d0
diff --git a/intel.c b/intel.c
09c1d0
index 20d2acdc12daa1128d72471d53639aebf82f4854..b655c4162f8980d5d826640fa4375c7ba6b1e97d 100644
09c1d0
--- a/intel.c
09c1d0
+++ b/intel.c
09c1d0
@@ -25,6 +25,7 @@
09c1d0
 #include "sandy-bridge.h"
09c1d0
 #include "ivy-bridge.h"
09c1d0
 #include "haswell.h"
09c1d0
+#include "skylake_xeon.h"
09c1d0
 
09c1d0
 int memory_error_support;
09c1d0
 
09c1d0
@@ -140,6 +141,13 @@ static int intel_memory_error(struct mce *m, unsigned recordlen)
09c1d0
 		case CPU_IVY_BRIDGE_EPEX:
09c1d0
 			ivy_bridge_ep_memerr_misc(m, channel, dimm);
09c1d0
 			break;
09c1d0
+		case CPU_HASWELL_EPEX:
09c1d0
+		case CPU_BROADWELL_EPEX:
09c1d0
+			haswell_memerr_misc(m, channel, dimm);
09c1d0
+			break;
09c1d0
+		case CPU_SKYLAKE_XEON:
09c1d0
+			skylake_memerr_misc(m, channel, dimm);
09c1d0
+			break;
09c1d0
 		default:
09c1d0
 			break;
09c1d0
 		} 
09c1d0
diff --git a/skylake_xeon.c b/skylake_xeon.c
09c1d0
index 16c6181987f0126d377b64a8f5d4a96a01bfa1c4..b02f8acd806e2a64ed1653f44349fd3e9abf374e 100644
09c1d0
--- a/skylake_xeon.c
09c1d0
+++ b/skylake_xeon.c
09c1d0
@@ -228,3 +228,45 @@ int skylake_s_ce_type(int bank, u64 status, u64 misc)
09c1d0
 
09c1d0
 	return 0;
09c1d0
 }
09c1d0
+
09c1d0
+/*
09c1d0
+ * There isn't enough information to identify the DIMM. But
09c1d0
+ * we can derive the channel from the bank number.
09c1d0
+ * There can be two memory controllers. We number the channels
09c1d0
+ * on the second controller: 3, 4, 5
09c1d0
+ */
09c1d0
+void skylake_memerr_misc(struct mce *m, int *channel, int *dimm)
09c1d0
+{
09c1d0
+	u64 status = m->status;
09c1d0
+	unsigned	chan;
09c1d0
+
09c1d0
+	/* Check this is a memory error */
09c1d0
+	if (!test_prefix(7, status & 0xefff))
09c1d0
+		return;
09c1d0
+
09c1d0
+	chan = EXTRACT(status, 0, 3);
09c1d0
+	if (chan == 0xf)
09c1d0
+		return;
09c1d0
+
09c1d0
+	switch (m->bank) {
09c1d0
+	case 7:
09c1d0
+		/* Home agent 0 */
09c1d0
+		break;
09c1d0
+	case 8:
09c1d0
+		/* Home agent 1 */
09c1d0
+		chan += 3;
09c1d0
+		break;
09c1d0
+	case 13: case 14: case 15:
09c1d0
+		/* Memory controller 0 */
09c1d0
+		chan = m->bank - 13;
09c1d0
+		break;
09c1d0
+	case 16: case 17: case 18:
09c1d0
+		/* Memory controller 1 */
09c1d0
+		chan = (m->bank - 16) + 3;
09c1d0
+		break;
09c1d0
+	default:
09c1d0
+		return;
09c1d0
+	}
09c1d0
+
09c1d0
+	channel[0] = chan;
09c1d0
+}
09c1d0
diff --git a/skylake_xeon.h b/skylake_xeon.h
09c1d0
index edcd9c030fa70f10ac23f2df9be948b10c73f4a1..098e6fa0e3eaff1b1d7e3040eddfb9187dabd7dd 100644
09c1d0
--- a/skylake_xeon.h
09c1d0
+++ b/skylake_xeon.h
09c1d0
@@ -1,2 +1,3 @@
09c1d0
 void skylake_s_decode_model(int cputype, int bank, u64 status, u64 misc);
09c1d0
 int skylake_s_ce_type(int bank, u64 status, u64 misc);
09c1d0
+void skylake_memerr_misc(struct mce *m, int *channel, int *dimm);