From: Prarit Bhargava <prarit@redhat.com>
Subject: mcelog: Deduce channel number for Haswell/Broadwell/Skylake systems
commit 6ed93e30f83519b0ab71f8ecd156b8ff0b2912b6
Author: Tony Luck <tony.luck@intel.com>
Date: Mon Sep 24 11:14:45 2018 -0700
mcelog: Deduce channel number for Haswell/Broadwell/Skylake systems
Ivy Bridge was the last system that gave us enough information
to figure out the exact DIMM that is the source of a memory error.
We gave up on DIMM logging at that point.
But we can still figure out the socket, memory controller and channel.
Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
diff --git a/haswell.c b/haswell.c
index 892ebc7248e808248798f21506b54faca147db9b..4eccbeb21a281467495e024b376d81be96b2183e 100644
--- a/haswell.c
+++ b/haswell.c
@@ -148,3 +148,45 @@ void hsw_decode_model(int cputype, int bank, u64 status, u64 misc)
break;
}
}
+
+/*
+ * Fill channel[0] for a memory error; the DIMM cannot be identified, so
+ * dimm is never written. Banks 7/8 (home agents) take the channel from the
+ * status field, banks 9-16 (memory controllers) derive it from the bank
+ * number. Channels on the second controller are numbered 4, 5, 6, 7.
+ */
+void haswell_memerr_misc(struct mce *m, int *channel, int *dimm)
+{
+ u64 status = m->status;
+ unsigned chan;
+
+ /* Not a memory error: return without touching channel[] */
+ if (!test_prefix(7, status & 0xefff))
+ return;
+
+ chan = EXTRACT(status, 0, 3);
+ if (chan == 0xf) /* NOTE(review): presumably "channel not specified" - confirm */
+ return; /* NOTE(review): also taken for banks 9-16, which recompute chan below */
+
+ switch (m->bank) {
+ case 7:
+ /* Home agent 0: the value taken from status is used as is */
+ break;
+ case 8:
+ /* Home agent 1: offset into the second controller's range 4-7 */
+ chan += 4;
+ break;
+ case 9: case 10: case 11: case 12:
+ /* Memory controller 0: banks 9-12 map to channels 0-3 */
+ chan = m->bank - 9;
+ break;
+ case 13: case 14: case 15: case 16:
+ /* Memory controller 1: banks 13-16 map to channels 4-7 */
+ chan = (m->bank - 13) + 4;
+ break;
+ default:
+ return; /* any other bank: channel[] is left untouched */
+ }
+
+ channel[0] = chan;
+}
diff --git a/haswell.h b/haswell.h
index ba3fb1c3c985aec0ac1a0a271dca3c3afd18874c..712c8eb66d50a1bf63a7dbd67382fe775b59d69b 100644
--- a/haswell.h
+++ b/haswell.h
@@ -1,2 +1,3 @@
void hsw_decode_model(int cputype, int bank, u64 status, u64 misc);
void haswell_ep_memerr_misc(struct mce *m, int *channel, int *dimm);
+void haswell_memerr_misc(struct mce *m, int *channel, int *dimm);
diff --git a/intel.c b/intel.c
index 20d2acdc12daa1128d72471d53639aebf82f4854..b655c4162f8980d5d826640fa4375c7ba6b1e97d 100644
--- a/intel.c
+++ b/intel.c
@@ -25,6 +25,7 @@
#include "sandy-bridge.h"
#include "ivy-bridge.h"
#include "haswell.h"
+#include "skylake_xeon.h"
int memory_error_support;
@@ -140,6 +141,13 @@ static int intel_memory_error(struct mce *m, unsigned recordlen)
case CPU_IVY_BRIDGE_EPEX:
ivy_bridge_ep_memerr_misc(m, channel, dimm);
break;
+ case CPU_HASWELL_EPEX:
+ case CPU_BROADWELL_EPEX:
+ haswell_memerr_misc(m, channel, dimm);
+ break;
+ case CPU_SKYLAKE_XEON:
+ skylake_memerr_misc(m, channel, dimm);
+ break;
default:
break;
}
diff --git a/skylake_xeon.c b/skylake_xeon.c
index 16c6181987f0126d377b64a8f5d4a96a01bfa1c4..b02f8acd806e2a64ed1653f44349fd3e9abf374e 100644
--- a/skylake_xeon.c
+++ b/skylake_xeon.c
@@ -228,3 +228,45 @@ int skylake_s_ce_type(int bank, u64 status, u64 misc)
return 0;
}
+
+/*
+ * Fill channel[0] for a memory error; the DIMM cannot be identified, so
+ * dimm is never written. Banks 7/8 (home agents) take the channel from the
+ * status field, banks 13-18 (memory controllers) derive it from the bank
+ * number. Channels on the second controller are numbered 3, 4, 5.
+ */
+void skylake_memerr_misc(struct mce *m, int *channel, int *dimm)
+{
+ u64 status = m->status;
+ unsigned chan;
+
+ /* Not a memory error: return without touching channel[] */
+ if (!test_prefix(7, status & 0xefff))
+ return;
+
+ chan = EXTRACT(status, 0, 3);
+ if (chan == 0xf) /* NOTE(review): presumably "channel not specified" - confirm */
+ return; /* NOTE(review): also taken for banks 13-18, which recompute chan below */
+
+ switch (m->bank) {
+ case 7:
+ /* Home agent 0: the value taken from status is used as is */
+ break;
+ case 8:
+ /* Home agent 1: offset into the second controller's range 3-5 */
+ chan += 3;
+ break;
+ case 13: case 14: case 15:
+ /* Memory controller 0: banks 13-15 map to channels 0-2 */
+ chan = m->bank - 13;
+ break;
+ case 16: case 17: case 18:
+ /* Memory controller 1: banks 16-18 map to channels 3-5 */
+ chan = (m->bank - 16) + 3;
+ break;
+ default:
+ return; /* any other bank: channel[] is left untouched */
+ }
+
+ channel[0] = chan;
+}
diff --git a/skylake_xeon.h b/skylake_xeon.h
index edcd9c030fa70f10ac23f2df9be948b10c73f4a1..098e6fa0e3eaff1b1d7e3040eddfb9187dabd7dd 100644
--- a/skylake_xeon.h
+++ b/skylake_xeon.h
@@ -1,2 +1,3 @@
void skylake_s_decode_model(int cputype, int bank, u64 status, u64 misc);
int skylake_s_ce_type(int bank, u64 status, u64 misc);
+void skylake_memerr_misc(struct mce *m, int *channel, int *dimm);