|
|
09c1d0 |
diff --git a/CHANGES b/CHANGES
|
|
|
09c1d0 |
index cd279c4..e3c4044 100644
|
|
|
09c1d0 |
--- a/CHANGES
|
|
|
09c1d0 |
+++ b/CHANGES
|
|
|
09c1d0 |
@@ -1,5 +1,9 @@
|
|
|
09c1d0 |
<newer changes first>
|
|
|
09c1d0 |
|
|
|
09c1d0 |
+Changes file is obsolete.
|
|
|
09c1d0 |
+Please see git log on https://git.kernel.org/cgit/utils/cpu/mce/mcelog.git/
|
|
|
09c1d0 |
+for newer changes.
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
Add Linux Kongress 2010 paper
|
|
|
09c1d0 |
Add Sandy Bridge Support
|
|
|
09c1d0 |
Write pid file by default in daemon mode
|
|
|
09c1d0 |
diff --git a/Makefile b/Makefile
|
|
|
09c1d0 |
index f8199f6..f3ba998 100644
|
|
|
09c1d0 |
--- a/Makefile
|
|
|
09c1d0 |
+++ b/Makefile
|
|
|
09c1d0 |
@@ -1,6 +1,7 @@
|
|
|
09c1d0 |
CFLAGS := -g -Os
|
|
|
09c1d0 |
prefix := /usr
|
|
|
09c1d0 |
etcprefix :=
|
|
|
09c1d0 |
+MANDIR := ${prefix}/share/man
|
|
|
09c1d0 |
# Define appropiately for your distribution
|
|
|
09c1d0 |
# DOCDIR := /usr/share/doc/packages/mcelog
|
|
|
09c1d0 |
|
|
|
09c1d0 |
@@ -54,21 +55,27 @@ SRC := $(OBJ:.o=.c)
|
|
|
09c1d0 |
mcelog: ${OBJ}
|
|
|
09c1d0 |
|
|
|
09c1d0 |
# dbquery intentionally not installed by default
|
|
|
09c1d0 |
-install: mcelog
|
|
|
09c1d0 |
- mkdir -p $(DESTDIR)${etcprefix}/etc/mcelog $(DESTDIR)${prefix}/sbin $(DESTDIR)${prefix}/share/man/man8
|
|
|
09c1d0 |
+install: mcelog mcelog.conf mcelog.conf.5 mcelog.triggers.5
|
|
|
09c1d0 |
+ mkdir -p $(DESTDIR)${etcprefix}/etc/mcelog $(DESTDIR)${prefix}/sbin $(DESTDIR)$(MANDIR)/man5 $(DESTDIR)$(MANDIR)/man8
|
|
|
09c1d0 |
install -m 755 -p mcelog $(DESTDIR)${prefix}/sbin/mcelog
|
|
|
09c1d0 |
- install -m 644 -p mcelog.8 $(DESTDIR)${prefix}/share/man/man8
|
|
|
09c1d0 |
+ install -m 644 -p mcelog.8 $(DESTDIR)$(MANDIR)/man8
|
|
|
09c1d0 |
+ install -m 644 -p mcelog.conf.5 $(DESTDIR)$(MANDIR)/man5
|
|
|
09c1d0 |
+ install -m 644 -p mcelog.triggers.5 $(DESTDIR)$(MANDIR)/man5
|
|
|
09c1d0 |
install -m 644 -p -b mcelog.conf $(DESTDIR)${etcprefix}/etc/mcelog/mcelog.conf
|
|
|
09c1d0 |
for i in ${TRIGGERS} ; do \
|
|
|
09c1d0 |
install -m 755 -p -b triggers/$$i $(DESTDIR)${etcprefix}/etc/mcelog ; \
|
|
|
09c1d0 |
done
|
|
|
09c1d0 |
ifdef DOCDIR
|
|
|
09c1d0 |
+ install -d 755 $(DESTDIR)${DOCDIR}
|
|
|
09c1d0 |
install -m 644 -p ${DOC} $(DESTDIR)${DOCDIR}
|
|
|
09c1d0 |
else
|
|
|
09c1d0 |
echo
|
|
|
09c1d0 |
echo "Consider defining DOCDIR to install additional documentation"
|
|
|
09c1d0 |
endif
|
|
|
09c1d0 |
|
|
|
09c1d0 |
+mcelog.conf.5: mcelog.conf config-intro.man
|
|
|
09c1d0 |
+ ./genconfig.py mcelog.conf config-intro.man > mcelog.conf.5
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
clean: test-clean
|
|
|
09c1d0 |
rm -f ${CLEAN} ${OBJ}
|
|
|
09c1d0 |
|
|
|
09c1d0 |
diff --git a/README b/README
|
|
|
09c1d0 |
index 08184ed..8aa8ec4 100644
|
|
|
09c1d0 |
--- a/README
|
|
|
09c1d0 |
+++ b/README
|
|
|
09c1d0 |
@@ -2,11 +2,15 @@ mcelog is the user space backend for logging machine check errors
|
|
|
09c1d0 |
reported by the hardware to the kernel. The kernel does the immediate
|
|
|
09c1d0 |
actions (like killing processes etc.) and mcelog decodes the errors
|
|
|
09c1d0 |
and manages various other advanced error responses like
|
|
|
09c1d0 |
-offlining memory, CPUs or triggering events.
|
|
|
09c1d0 |
+offlining memory, CPUs or triggering events. In addition
|
|
|
09c1d0 |
+mcelog also handles corrected errors, by logging and accounting them.
|
|
|
09c1d0 |
|
|
|
09c1d0 |
It primarily handles machine checks and thermal events, which
|
|
|
09c1d0 |
are reported for errors detected by the CPU.
|
|
|
09c1d0 |
|
|
|
09c1d0 |
+For more details on what mcelog can do and the underlying theory
|
|
|
09c1d0 |
+see http://www.mcelog.org
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
It is recommended that mcelog runs on all x86 machines, both
|
|
|
09c1d0 |
64bit (since early 2.6) and 32bit (since 2.6.32)
|
|
|
09c1d0 |
|
|
|
09c1d0 |
@@ -40,6 +44,11 @@ mce.pdf is a very old paper describing the first releases of mcelog
|
|
|
09c1d0 |
|
|
|
09c1d0 |
For distributors:
|
|
|
09c1d0 |
|
|
|
09c1d0 |
+You can run mcelog from systemd or similar daemons. An example
|
|
|
09c1d0 |
+systemd unit file is in mcelog.service.
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+For older distributions using init scripts:
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
Please install a init script by default that runs mcelog in daemon mode.
|
|
|
09c1d0 |
The mcelog.init script is a good starting point.
|
|
|
09c1d0 |
|
|
|
09c1d0 |
diff --git a/client.c b/client.c
|
|
|
09c1d0 |
index 6a67683..7c7aeb8 100644
|
|
|
09c1d0 |
--- a/client.c
|
|
|
09c1d0 |
+++ b/client.c
|
|
|
09c1d0 |
@@ -29,9 +29,9 @@ void ask_server(char *command)
|
|
|
09c1d0 |
{
|
|
|
09c1d0 |
struct sockaddr_un sun;
|
|
|
09c1d0 |
int fd;
|
|
|
09c1d0 |
+ FILE * fp;
|
|
|
09c1d0 |
int n;
|
|
|
09c1d0 |
char buf[1024];
|
|
|
09c1d0 |
- int done;
|
|
|
09c1d0 |
char *path = config_string("server", "socket-path");
|
|
|
09c1d0 |
if (!path)
|
|
|
09c1d0 |
path = SOCKET_PATH;
|
|
|
09c1d0 |
@@ -52,14 +52,18 @@ void ask_server(char *command)
|
|
|
09c1d0 |
if (write(fd, command, n) != n)
|
|
|
09c1d0 |
SYSERRprintf("client command write");
|
|
|
09c1d0 |
|
|
|
09c1d0 |
- done = 0;
|
|
|
09c1d0 |
- while (!done && (n = read(fd, buf, sizeof buf)) > 0) {
|
|
|
09c1d0 |
- if (n >= 5 && !memcmp(buf + n - 5, "done\n", 5)) {
|
|
|
09c1d0 |
- n -= 5;
|
|
|
09c1d0 |
- done = 1;
|
|
|
09c1d0 |
+ if ((fp = fdopen(fd, "r")) != NULL) {
|
|
|
09c1d0 |
+ while (fgets(buf, sizeof buf, fp)) {
|
|
|
09c1d0 |
+ n = strlen(buf);
|
|
|
09c1d0 |
+ if (n >= 5 && !memcmp(buf + n - 5, "done\n", 5)) {
|
|
|
09c1d0 |
+ fclose(fp);
|
|
|
09c1d0 |
+ return;
|
|
|
09c1d0 |
+ }
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+ fputs(buf, stdout);
|
|
|
09c1d0 |
}
|
|
|
09c1d0 |
- write(1, buf, n);
|
|
|
09c1d0 |
+ fclose(fp);
|
|
|
09c1d0 |
}
|
|
|
09c1d0 |
- if (n < 0)
|
|
|
09c1d0 |
- SYSERRprintf("client read");
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+ SYSERRprintf("client read");
|
|
|
09c1d0 |
}
|
|
|
09c1d0 |
diff --git a/config-intro.man b/config-intro.man
|
|
|
09c1d0 |
new file mode 100644
|
|
|
09c1d0 |
index 0000000..c06610d
|
|
|
09c1d0 |
--- /dev/null
|
|
|
09c1d0 |
+++ b/config-intro.man
|
|
|
09c1d0 |
@@ -0,0 +1,10 @@
|
|
|
09c1d0 |
+.SH NAME
|
|
|
09c1d0 |
+mcelog.conf \- mcelog.conf reference
|
|
|
09c1d0 |
+.SH SYNOPSIS
|
|
|
09c1d0 |
+.B /etc/mcelog.conf
|
|
|
09c1d0 |
+.SH DESCRIPTION
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+/etc/mcelog.conf is the main configuration file for
|
|
|
09c1d0 |
+.B mcelog(8).
|
|
|
09c1d0 |
+This is a configuration file separated into sections including
|
|
|
09c1d0 |
+a default section.
|
|
|
09c1d0 |
diff --git a/dmi.c b/dmi.c
|
|
|
09c1d0 |
index 290a053..b5492cd 100644
|
|
|
09c1d0 |
--- a/dmi.c
|
|
|
09c1d0 |
+++ b/dmi.c
|
|
|
09c1d0 |
@@ -162,6 +162,8 @@ static int get_efi_base_addr(size_t *address)
|
|
|
09c1d0 |
check_symbol:
|
|
|
09c1d0 |
while ((fgets(linebuf, sizeof(linebuf) - 1, efi_systab)) != NULL) {
|
|
|
09c1d0 |
char *addrp = strchr(linebuf, '=');
|
|
|
09c1d0 |
+ if (!addrp)
|
|
|
09c1d0 |
+ break;
|
|
|
09c1d0 |
*(addrp++) = '\0';
|
|
|
09c1d0 |
|
|
|
09c1d0 |
if (strcmp(linebuf, "SMBIOS") == 0) {
|
|
|
09c1d0 |
diff --git a/genconfig.py b/genconfig.py
|
|
|
09c1d0 |
new file mode 100755
|
|
|
09c1d0 |
index 0000000..aed6992
|
|
|
09c1d0 |
--- /dev/null
|
|
|
09c1d0 |
+++ b/genconfig.py
|
|
|
09c1d0 |
@@ -0,0 +1,80 @@
|
|
|
09c1d0 |
+#!/usr/bin/python
|
|
|
09c1d0 |
+# generate man config documentation from mcelog.conf example
|
|
|
09c1d0 |
+# genconfig.py mcelog.conf config-intro.man
|
|
|
09c1d0 |
+import sys
|
|
|
09c1d0 |
+import re
|
|
|
09c1d0 |
+import string
|
|
|
09c1d0 |
+import argparse
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+ap = argparse.ArgumentParser(description="generate man config documentation from mcelog.conf example")
|
|
|
09c1d0 |
+ap.add_argument('config', type=argparse.FileType('r'), help="mcelog example config file")
|
|
|
09c1d0 |
+ap.add_argument('intro', type=argparse.FileType('r'), help="intro file")
|
|
|
09c1d0 |
+args = ap.parse_args()
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+def parse(f):
|
|
|
09c1d0 |
+ lineno = 1
|
|
|
09c1d0 |
+ explanation = 0
|
|
|
09c1d0 |
+ header = 1
|
|
|
09c1d0 |
+ for line in f:
|
|
|
09c1d0 |
+ lineno += 1
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+ # skip first comment
|
|
|
09c1d0 |
+ if header:
|
|
|
09c1d0 |
+ if not re.match('^#', line):
|
|
|
09c1d0 |
+ header = 0
|
|
|
09c1d0 |
+ continue
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+ # explanation
|
|
|
09c1d0 |
+ m = re.match('^#\s(.*)', line)
|
|
|
09c1d0 |
+ if m:
|
|
|
09c1d0 |
+ explanation += 1
|
|
|
09c1d0 |
+ s = m.group(1)
|
|
|
09c1d0 |
+ if explanation == 1:
|
|
|
09c1d0 |
+ s = string.capitalize(s)
|
|
|
09c1d0 |
+ print s
|
|
|
09c1d0 |
+ continue
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+ if explanation:
|
|
|
09c1d0 |
+ print ".PP"
|
|
|
09c1d0 |
+ explanation = 0
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+ # empty line: new option
|
|
|
09c1d0 |
+ if re.match('\s+', line):
|
|
|
09c1d0 |
+ new_option()
|
|
|
09c1d0 |
+ continue
|
|
|
09c1d0 |
+ # group
|
|
|
09c1d0 |
+ m = re.match('\[(.*)\]', line)
|
|
|
09c1d0 |
+ if m:
|
|
|
09c1d0 |
+ start_group(m.group(1))
|
|
|
09c1d0 |
+ continue
|
|
|
09c1d0 |
+ # config option
|
|
|
09c1d0 |
+ m = re.match('^(#?)([a-z-]+) = (.*)', line)
|
|
|
09c1d0 |
+ if m:
|
|
|
09c1d0 |
+ config_option(m.group(1), m.group(2), m.group(3))
|
|
|
09c1d0 |
+ continue
|
|
|
09c1d0 |
+ print >>sys.stderr, "Unparseable line %d" % (lineno-1)
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+def config_option(enabled, name, value):
|
|
|
09c1d0 |
+ print ".B %s = %s" % (name, value)
|
|
|
09c1d0 |
+ print ".PP"
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+def start_group(name):
|
|
|
09c1d0 |
+ print ".SS \"The %s config section\"" % (name)
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+def new_option():
|
|
|
09c1d0 |
+ print ".PP"
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+print """
|
|
|
09c1d0 |
+.\" Auto generated mcelog.conf manpage. Do not edit.
|
|
|
09c1d0 |
+.TH "mcelog.conf" 5 "mcelog"
|
|
|
09c1d0 |
+"""
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+print args.intro.read()
|
|
|
09c1d0 |
+parse(args.config)
|
|
|
09c1d0 |
+print """
|
|
|
09c1d0 |
+.SH SEE ALSO
|
|
|
09c1d0 |
+.BR mcelog (8),
|
|
|
09c1d0 |
+.BR mcelog.triggers (5)
|
|
|
09c1d0 |
+.B http://www.mcelog.org
|
|
|
09c1d0 |
+"""
|
|
|
09c1d0 |
diff --git a/haswell.c b/haswell.c
|
|
|
09c1d0 |
index 0fef6a5..b309ae5 100644
|
|
|
09c1d0 |
--- a/haswell.c
|
|
|
09c1d0 |
+++ b/haswell.c
|
|
|
09c1d0 |
@@ -1,5 +1,5 @@
|
|
|
09c1d0 |
/* Copyright (C) 2013 Intel Corporation
|
|
|
09c1d0 |
- Decode Intel Ivy Bridge specific machine check errors.
|
|
|
09c1d0 |
+ Decode Intel Haswell specific machine check errors.
|
|
|
09c1d0 |
|
|
|
09c1d0 |
mcelog is free software; you can redistribute it and/or
|
|
|
09c1d0 |
modify it under the terms of the GNU General Public
|
|
|
09c1d0 |
diff --git a/intel.c b/intel.c
|
|
|
09c1d0 |
index fe08eab..f893be5 100644
|
|
|
09c1d0 |
--- a/intel.c
|
|
|
09c1d0 |
+++ b/intel.c
|
|
|
09c1d0 |
@@ -34,7 +34,8 @@ void intel_cpu_init(enum cputype cpu)
|
|
|
09c1d0 |
if (cpu == CPU_NEHALEM || cpu == CPU_XEON75XX || cpu == CPU_INTEL ||
|
|
|
09c1d0 |
cpu == CPU_SANDY_BRIDGE || cpu == CPU_SANDY_BRIDGE_EP ||
|
|
|
09c1d0 |
cpu == CPU_IVY_BRIDGE || cpu == CPU_IVY_BRIDGE_EPEX ||
|
|
|
09c1d0 |
- cpu == CPU_HASWELL || cpu == CPU_HASWELL_EPEX)
|
|
|
09c1d0 |
+ cpu == CPU_HASWELL || cpu == CPU_HASWELL_EPEX || cpu == CPU_BROADWELL ||
|
|
|
09c1d0 |
+ cpu == CPU_KNIGHTS_LANDING)
|
|
|
09c1d0 |
memory_error_support = 1;
|
|
|
09c1d0 |
}
|
|
|
09c1d0 |
|
|
|
09c1d0 |
@@ -72,6 +73,15 @@ enum cputype select_intel_cputype(int family, int model)
|
|
|
09c1d0 |
return CPU_HASWELL;
|
|
|
09c1d0 |
else if (model == 0x3f)
|
|
|
09c1d0 |
return CPU_HASWELL_EPEX;
|
|
|
09c1d0 |
+ else if (model == 0x3d || model == 0x56)
|
|
|
09c1d0 |
+ return CPU_BROADWELL;
|
|
|
09c1d0 |
+ else if (model == 0x57)
|
|
|
09c1d0 |
+ return CPU_KNIGHTS_LANDING;
|
|
|
09c1d0 |
+ else if (model == 0x1c || model == 0x26 || model == 0x27 ||
|
|
|
09c1d0 |
+ model == 0x35 || model == 0x36 || model == 0x36 ||
|
|
|
09c1d0 |
+ model == 0x37 || model == 0x4a || model == 0x4c ||
|
|
|
09c1d0 |
+ model == 0x4d || model == 0x5a || model == 0x5d)
|
|
|
09c1d0 |
+ return CPU_ATOM;
|
|
|
09c1d0 |
if (model > 0x1a) {
|
|
|
09c1d0 |
Eprintf("Family 6 Model %x CPU: only decoding architectural errors\n",
|
|
|
09c1d0 |
model);
|
|
|
09c1d0 |
diff --git a/intel.h b/intel.h
|
|
|
09c1d0 |
index 00191d5..9d109b1 100644
|
|
|
09c1d0 |
--- a/intel.h
|
|
|
09c1d0 |
+++ b/intel.h
|
|
|
09c1d0 |
@@ -19,5 +19,7 @@ extern int memory_error_support;
|
|
|
09c1d0 |
case CPU_IVY_BRIDGE: \
|
|
|
09c1d0 |
case CPU_IVY_BRIDGE_EPEX: \
|
|
|
09c1d0 |
case CPU_HASWELL: \
|
|
|
09c1d0 |
- case CPU_HASWELL_EPEX
|
|
|
09c1d0 |
+ case CPU_HASWELL_EPEX: \
|
|
|
09c1d0 |
+ case CPU_BROADWELL: \
|
|
|
09c1d0 |
+ case CPU_KNIGHTS_LANDING
|
|
|
09c1d0 |
|
|
|
09c1d0 |
diff --git a/leaky-bucket.c b/leaky-bucket.c
|
|
|
09c1d0 |
index c2c501b..721ab22 100644
|
|
|
09c1d0 |
--- a/leaky-bucket.c
|
|
|
09c1d0 |
+++ b/leaky-bucket.c
|
|
|
09c1d0 |
@@ -25,7 +25,7 @@ time_t __attribute__((weak)) bucket_time(void)
|
|
|
09c1d0 |
return time(NULL);
|
|
|
09c1d0 |
}
|
|
|
09c1d0 |
|
|
|
09c1d0 |
-static void bucket_age(const struct bucket_conf *c, struct leaky_bucket *b,
|
|
|
09c1d0 |
+void bucket_age(const struct bucket_conf *c, struct leaky_bucket *b,
|
|
|
09c1d0 |
time_t now)
|
|
|
09c1d0 |
{
|
|
|
09c1d0 |
long diff;
|
|
|
09c1d0 |
diff --git a/leaky-bucket.h b/leaky-bucket.h
|
|
|
09c1d0 |
index 497719e..860ba3c 100644
|
|
|
09c1d0 |
--- a/leaky-bucket.h
|
|
|
09c1d0 |
+++ b/leaky-bucket.h
|
|
|
09c1d0 |
@@ -27,5 +27,7 @@ char *bucket_output(const struct bucket_conf *c, struct leaky_bucket *b);
|
|
|
09c1d0 |
int bucket_conf_init(struct bucket_conf *c, const char *rate);
|
|
|
09c1d0 |
void bucket_init(struct leaky_bucket *b);
|
|
|
09c1d0 |
time_t bucket_time(void);
|
|
|
09c1d0 |
+void bucket_age(const struct bucket_conf *c, struct leaky_bucket *b,
|
|
|
09c1d0 |
+ time_t now);
|
|
|
09c1d0 |
|
|
|
09c1d0 |
#endif
|
|
|
09c1d0 |
diff --git a/mcelog.8 b/mcelog.8
|
|
|
09c1d0 |
index f8a77c4..3781db6 100644
|
|
|
09c1d0 |
--- a/mcelog.8
|
|
|
09c1d0 |
+++ b/mcelog.8
|
|
|
09c1d0 |
@@ -1,5 +1,4 @@
|
|
|
09c1d0 |
-.\" disk db commented out for now because it's not usable enough
|
|
|
09c1d0 |
-.TH MCELOG 8 "May 2009" "" "Linux's Administrator's Manual"
|
|
|
09c1d0 |
+.TH MCELOG 8 "Mar 2015" "" "Linux's Administrator's Manual"
|
|
|
09c1d0 |
.SH NAME
|
|
|
09c1d0 |
mcelog \- Decode kernel machine check log on x86 machines
|
|
|
09c1d0 |
.SH SYNOPSIS
|
|
|
09c1d0 |
@@ -26,13 +25,16 @@ in main memory by an integrated memory controller, data
|
|
|
09c1d0 |
transfer errors on the front side bus or CPU interconnect or other internal
|
|
|
09c1d0 |
errors.
|
|
|
09c1d0 |
Possible causes can be cosmic radiation, instable power supplies,
|
|
|
09c1d0 |
-cooling problems, broken hardware, or bad luck.
|
|
|
09c1d0 |
+cooling problems, broken hardware, running systems out of specification,
|
|
|
09c1d0 |
+or bad luck.
|
|
|
09c1d0 |
|
|
|
09c1d0 |
Most errors can be corrected by the CPU by internal error correction
|
|
|
09c1d0 |
mechanisms. Uncorrected errors cause machine check exceptions which
|
|
|
09c1d0 |
-may panic the machine.
|
|
|
09c1d0 |
+may kill processes or panic the machine. A small number of corrected
|
|
|
09c1d0 |
+errors is usually not a cause for worry, but a large number can indicate
|
|
|
09c1d0 |
+future failure.
|
|
|
09c1d0 |
|
|
|
09c1d0 |
-When a corrected error happens the x86 kernel writes a record describing
|
|
|
09c1d0 |
+When a corrected or recovered error happens the x86 kernel writes a record describing
|
|
|
09c1d0 |
the MCE into a internal ring buffer available through the
|
|
|
09c1d0 |
.I /dev/mcelog
|
|
|
09c1d0 |
device
|
|
|
09c1d0 |
@@ -43,7 +45,11 @@ decodes them into a human readable format and prints them
|
|
|
09c1d0 |
on the standard output or optionally into the system log.
|
|
|
09c1d0 |
|
|
|
09c1d0 |
Optionally it can also take more options like keeping statistics or
|
|
|
09c1d0 |
-triggering shell scripts on specific events.
|
|
|
09c1d0 |
+triggering shell scripts on specific events. By default mcelog
|
|
|
09c1d0 |
+supports offlining memory pages with persistent corrected errors,
|
|
|
09c1d0 |
+offlining CPU cores if they developed cache problems,
|
|
|
09c1d0 |
+and otherwise logging specific events to the system log after
|
|
|
09c1d0 |
+they crossed a threshold.
|
|
|
09c1d0 |
|
|
|
09c1d0 |
The normal operating modi for mcelog are running
|
|
|
09c1d0 |
as a regular cron job (traditional way, deprecated),
|
|
|
09c1d0 |
@@ -112,12 +118,12 @@ and undocumented now.
|
|
|
09c1d0 |
|
|
|
09c1d0 |
With the
|
|
|
09c1d0 |
.B \-\-dmi
|
|
|
09c1d0 |
-option mcelog will look up the addresses reported in machine
|
|
|
09c1d0 |
+option mcelog will look up the DIMMs reported in machine
|
|
|
09c1d0 |
checks in the
|
|
|
09c1d0 |
.I SMBIOS/DMI
|
|
|
09c1d0 |
-tables of the BIOS.
|
|
|
09c1d0 |
-This can sometimes tell you which DIMM or memory controller
|
|
|
09c1d0 |
-has developed a problem. More often the information reported
|
|
|
09c1d0 |
+tables of the BIOS and map the DIMMs to board identifiers.
|
|
|
09c1d0 |
+This only works when the BIOS reports the identifiers correctly.
|
|
|
09c1d0 |
+Unfortunately often the information reported
|
|
|
09c1d0 |
by the BIOS is either subtly or obviously wrong or useless.
|
|
|
09c1d0 |
This option requires that mcelog has read access to /dev/mem
|
|
|
09c1d0 |
(normally requires root) and runs on the same machine
|
|
|
09c1d0 |
@@ -281,6 +287,9 @@ option use
|
|
|
09c1d0 |
use
|
|
|
09c1d0 |
.I logfile = /tmp/logfile
|
|
|
09c1d0 |
|
|
|
09c1d0 |
+For more information on the config file please see
|
|
|
09c1d0 |
+.B mcelog.conf(5).
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
.SH NOTES
|
|
|
09c1d0 |
The kernel prefers old messages over new. If the log buffer overflows
|
|
|
09c1d0 |
only old ones will be kept.
|
|
|
09c1d0 |
@@ -308,9 +317,14 @@ restarting the daemon.
|
|
|
09c1d0 |
|
|
|
09c1d0 |
.\"/var/lib/memory-errors
|
|
|
09c1d0 |
.SH SEE ALSO
|
|
|
09c1d0 |
+.BR mcelog.conf(5),
|
|
|
09c1d0 |
+.BR mcelog.triggers(5)
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+http://www.mcelog.org
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
AMD x86-64 architecture programmer's manual, Volume 2, System programming
|
|
|
09c1d0 |
|
|
|
09c1d0 |
Intel 64 and IA32 Architectures Software Developer's manual, Volume 3, System programming guide
|
|
|
09c1d0 |
-Parts 1 and 2. Machine checks are described in Chapter 14 in Part1 and in Appendix E in Part2.
|
|
|
09c1d0 |
+Chapter 15 and 16. http://www.intel.com/sdm
|
|
|
09c1d0 |
|
|
|
09c1d0 |
Datasheet of your CPU.
|
|
|
09c1d0 |
diff --git a/mcelog.c b/mcelog.c
|
|
|
09c1d0 |
index 95a913f..96c0a9d 100644
|
|
|
09c1d0 |
--- a/mcelog.c
|
|
|
09c1d0 |
+++ b/mcelog.c
|
|
|
09c1d0 |
@@ -231,6 +231,9 @@ static char *cputype_name[] = {
|
|
|
09c1d0 |
[CPU_IVY_BRIDGE_EPEX] = "Ivy Bridge EP/EX", /* Fill in better name */
|
|
|
09c1d0 |
[CPU_HASWELL] = "Haswell", /* Fill in better name */
|
|
|
09c1d0 |
[CPU_HASWELL_EPEX] = "Haswell EP/EX", /* Fill in better name */
|
|
|
09c1d0 |
+ [CPU_BROADWELL] = "Broadwell",
|
|
|
09c1d0 |
+ [CPU_KNIGHTS_LANDING] = "Knights Landing",
|
|
|
09c1d0 |
+ [CPU_ATOM] = "ATOM",
|
|
|
09c1d0 |
};
|
|
|
09c1d0 |
|
|
|
09c1d0 |
static struct config_choice cpu_choices[] = {
|
|
|
09c1d0 |
@@ -269,7 +272,10 @@ static struct config_choice cpu_choices[
|
|
|
09c1d0 |
{ "haswell", CPU_HASWELL }, /* Fill in better name */
|
|
|
09c1d0 |
{ "haswell-ep", CPU_HASWELL_EPEX }, /* Fill in better name */
|
|
|
09c1d0 |
{ "haswell-ex", CPU_HASWELL_EPEX }, /* Fill in better name */
|
|
|
09c1d0 |
- {}
|
|
|
09c1d0 |
+ { "broadwell", CPU_BROADWELL },
|
|
|
09c1d0 |
+ { "knightslanding", CPU_KNIGHTS_LANDING },
|
|
|
09c1d0 |
+ { "atom", CPU_ATOM },
|
|
|
09c1d0 |
+ { NULL }
|
|
|
09c1d0 |
};
|
|
|
09c1d0 |
|
|
|
09c1d0 |
static void print_cputypes(void)
|
|
|
09c1d0 |
@@ -430,7 +436,8 @@ static void dump_mce(struct mce *m, unsi
|
|
|
09c1d0 |
mod);
|
|
|
09c1d0 |
}
|
|
|
09c1d0 |
if (cputype != CPU_SANDY_BRIDGE_EP && cputype != CPU_IVY_BRIDGE_EPEX &&
|
|
|
09c1d0 |
- cputype != CPU_HASWELL_EPEX)
|
|
|
09c1d0 |
+ cputype != CPU_HASWELL_EPEX && cputype != CPU_BROADWELL &&
|
|
|
09c1d0 |
+ cputype != CPU_KNIGHTS_LANDING)
|
|
|
09c1d0 |
resolveaddr(m->addr);
|
|
|
09c1d0 |
if (!ascii_mode && ismemerr && (m->status & MCI_STATUS_ADDRV)) {
|
|
|
09c1d0 |
diskdb_resolve_addr(m->addr);
|
|
|
09c1d0 |
@@ -517,7 +524,7 @@ int is_cpu_supported(void)
|
|
|
09c1d0 |
if (family == 15) {
|
|
|
09c1d0 |
cputype = CPU_K8;
|
|
|
09c1d0 |
} else if (family >= 16) {
|
|
|
09c1d0 |
- SYSERRprintf("AMD Processor family %d: Please use the edac_mce_amd module instead.\n", family);
|
|
|
09c1d0 |
+ SYSERRprintf("ERROR: AMD Processor family %d: mcelog does not support this processor. Please use the edac_mce_amd module instead.\n", family);
|
|
|
09c1d0 |
return 0;
|
|
|
09c1d0 |
}
|
|
|
09c1d0 |
} else if (!strcmp(vendor,"GenuineIntel"))
|
|
|
09c1d0 |
@@ -741,7 +748,7 @@ restart:
|
|
|
09c1d0 |
else
|
|
|
09c1d0 |
s += 3;
|
|
|
09c1d0 |
|
|
|
09c1d0 |
- n = sscanf(s, "%02x:<%016Lx> {%100s}%n",
|
|
|
09c1d0 |
+ n = sscanf(s, "%02x:<%016Lx> {%99s}%n",
|
|
|
09c1d0 |
&cs,
|
|
|
09c1d0 |
&m.ip,
|
|
|
09c1d0 |
symbol, &next;;
|
|
|
09c1d0 |
@@ -1377,7 +1384,7 @@ int main(int ac, char **av)
|
|
|
09c1d0 |
|
|
|
09c1d0 |
d.buf = xalloc(d.recordlen * d.loglen);
|
|
|
09c1d0 |
if (daemon_mode) {
|
|
|
09c1d0 |
- prefill_memdb();
|
|
|
09c1d0 |
+ prefill_memdb(do_dmi);
|
|
|
09c1d0 |
if (!do_dmi)
|
|
|
09c1d0 |
closedmi();
|
|
|
09c1d0 |
server_setup();
|
|
|
09c1d0 |
diff --git a/mcelog.conf b/mcelog.conf
|
|
|
09c1d0 |
index 6a2be26..f8abb99 100644
|
|
|
09c1d0 |
--- a/mcelog.conf
|
|
|
09c1d0 |
+++ b/mcelog.conf
|
|
|
09c1d0 |
@@ -9,36 +9,36 @@
|
|
|
09c1d0 |
# white space is not allowed in value currently, except at the end where it is dropped
|
|
|
09c1d0 |
#
|
|
|
09c1d0 |
|
|
|
09c1d0 |
-# in general all command line options that are not commands work here
|
|
|
09c1d0 |
-# see man mcelog or mcelog --help for a list
|
|
|
09c1d0 |
+# In general all command line options that are not commands work here.
|
|
|
09c1d0 |
+# See man mcelog or mcelog --help for a list.
|
|
|
09c1d0 |
# e.g. to enable the --no-syslog option use
|
|
|
09c1d0 |
#no-syslog = yes (or no to disable)
|
|
|
09c1d0 |
# when the option has a argument
|
|
|
09c1d0 |
#logfile = /tmp/logfile
|
|
|
09c1d0 |
-# below are the options which are not command line options
|
|
|
09c1d0 |
+# below are the options which are not command line options.
|
|
|
09c1d0 |
|
|
|
09c1d0 |
# Set CPU type for which mcelog decodes events:
|
|
|
09c1d0 |
#cpu = type
|
|
|
09c1d0 |
-# for valid values for type please see mcelog --help
|
|
|
09c1d0 |
+# For valid values for type please see mcelog --help.
|
|
|
09c1d0 |
# If this value is set incorrectly the decoded output will be likely incorrect.
|
|
|
09c1d0 |
-# by default when this parameter is not set mcelog uses the CPU it is running on
|
|
|
09c1d0 |
+# By default when this parameter is not set mcelog uses the CPU it is running on
|
|
|
09c1d0 |
# on very new kernels the mcelog events reported by the kernel also carry
|
|
|
09c1d0 |
# the CPU type which is used too when available and not overriden.
|
|
|
09c1d0 |
|
|
|
09c1d0 |
# Enable daemon mode:
|
|
|
09c1d0 |
#daemon = yes
|
|
|
09c1d0 |
# By default mcelog just processes the currently pending events and exits.
|
|
|
09c1d0 |
-# in daemon mode it will keep running as a daemon in the background and poll
|
|
|
09c1d0 |
+# In daemon mode it will keep running as a daemon in the background and poll
|
|
|
09c1d0 |
# the kernel for events and then decode them.
|
|
|
09c1d0 |
|
|
|
09c1d0 |
-# Filter out known broken events by default
|
|
|
09c1d0 |
+# Filter out known broken events by default.
|
|
|
09c1d0 |
filter = yes
|
|
|
09c1d0 |
-# don't log memory errors individually
|
|
|
09c1d0 |
-# they still get accounted if that is enabled
|
|
|
09c1d0 |
+# Don't log memory errors individually.
|
|
|
09c1d0 |
+# They still get accounted if that is enabled.
|
|
|
09c1d0 |
#filter-memory-errors = yes
|
|
|
09c1d0 |
|
|
|
09c1d0 |
# output in undecoded raw format to be easier machine readable
|
|
|
09c1d0 |
-# (default is decoded)
|
|
|
09c1d0 |
+# (default is decoded).
|
|
|
09c1d0 |
#raw = yes
|
|
|
09c1d0 |
|
|
|
09c1d0 |
# Set CPU Mhz to decode uptime from time stamp counter (output
|
|
|
09c1d0 |
@@ -62,16 +62,17 @@ filter = yes
|
|
|
09c1d0 |
# Append log output to logfile instead of stdout. Only when no syslog logging is active
|
|
|
09c1d0 |
#logfile = filename
|
|
|
09c1d0 |
|
|
|
09c1d0 |
-# Use SMBIOS information to decode DIMMs (needs root)
|
|
|
09c1d0 |
-# This function is not recommended to use right now and generally not needed
|
|
|
09c1d0 |
+# Use SMBIOS information to decode DIMMs (needs root).
|
|
|
09c1d0 |
+# This function is not recommended to use right now and generally not needed.
|
|
|
09c1d0 |
# The exception is memdb prepopulation, which is configured separately below.
|
|
|
09c1d0 |
#dmi = no
|
|
|
09c1d0 |
|
|
|
09c1d0 |
-# when in daemon mode run as this user after set up
|
|
|
09c1d0 |
-# note that the triggers will run as this user too
|
|
|
09c1d0 |
-# setting this to non root will mean that triggers cannot take some corrective
|
|
|
09c1d0 |
-# action, like offlining objects
|
|
|
09c1d0 |
+# When in daemon mode run as this user after set up.
|
|
|
09c1d0 |
+# Note that the triggers will run as this user too.
|
|
|
09c1d0 |
+# Setting this to non root will mean that triggers cannot take some corrective
|
|
|
09c1d0 |
+# action, like offlining objects.
|
|
|
09c1d0 |
#run-credentials-user = root
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
# group to run as daemon with
|
|
|
09c1d0 |
# default to the group of the run-credentials-user
|
|
|
09c1d0 |
#run-credentials-group = nobody
|
|
|
09c1d0 |
@@ -79,72 +80,88 @@ filter = yes
|
|
|
09c1d0 |
[server]
|
|
|
09c1d0 |
# user allowed to access client socket.
|
|
|
09c1d0 |
# when set to * match any
|
|
|
09c1d0 |
-# root is always allowed to access
|
|
|
09c1d0 |
+# root is always allowed to access.
|
|
|
09c1d0 |
# default: root only
|
|
|
09c1d0 |
client-user = root
|
|
|
09c1d0 |
# group allowed to access mcelog
|
|
|
09c1d0 |
-# when no group is configured any group matches (but still user checking)
|
|
|
09c1d0 |
+# When no group is configured any group matches (but still user checking).
|
|
|
09c1d0 |
# when set to * match any
|
|
|
09c1d0 |
#client-group = root
|
|
|
09c1d0 |
-# path to the unix socket for client<->server communication
|
|
|
09c1d0 |
-# when no socket-path is configured the server will not start
|
|
|
09c1d0 |
+# Path to the unix socket for client<->server communication.
|
|
|
09c1d0 |
+# When no socket-path is configured the server will not start
|
|
|
09c1d0 |
#socket-path = /var/run/mcelog-client
|
|
|
09c1d0 |
-# when mcelog starts it checks if a server is already running. timeout
|
|
|
09c1d0 |
+# When mcelog starts it checks if a server is already running. This configures the timeout
|
|
|
09c1d0 |
# for this check.
|
|
|
09c1d0 |
#initial-ping-timeout = 2
|
|
|
09c1d0 |
#
|
|
|
09c1d0 |
[dimm]
|
|
|
09c1d0 |
# Is the in memory DIMM error tracking enabled?
|
|
|
09c1d0 |
# Only works on systems with integrated memory controller and
|
|
|
09c1d0 |
-# which are supported
|
|
|
09c1d0 |
-# Only takes effect in daemon mode
|
|
|
09c1d0 |
+# which are supported.
|
|
|
09c1d0 |
+# Only takes effect in daemon mode.
|
|
|
09c1d0 |
dimm-tracking-enabled = yes
|
|
|
09c1d0 |
-# Use DMI information from the BIOS to prepopulate DIMM database
|
|
|
09c1d0 |
+# Use DMI information from the BIOS to prepopulate DIMM database.
|
|
|
09c1d0 |
# Note this might not work with all BIOS and requires mcelog to run as root.
|
|
|
09c1d0 |
# Alternative is to let mcelog create DIMM objects on demand.
|
|
|
09c1d0 |
dmi-prepopulate = yes
|
|
|
09c1d0 |
#
|
|
|
09c1d0 |
-# execute these triggers when the rate of corrected or uncorrected
|
|
|
09c1d0 |
-# errors per DIMM exceeds the threshold
|
|
|
09c1d0 |
+# Execute these triggers when the rate of corrected or uncorrected
|
|
|
09c1d0 |
+# errors per DIMM exceeds the threshold.
|
|
|
09c1d0 |
# Note when the hardware does not report DIMMs this might also
|
|
|
09c1d0 |
-# be per channel
|
|
|
09c1d0 |
+# be per channel.
|
|
|
09c1d0 |
# The default of 10/24h is reasonable for server quality
|
|
|
09c1d0 |
-# DDR3 DIMMs as of 2009/10
|
|
|
09c1d0 |
+# DDR3 DIMMs as of 2009/10.
|
|
|
09c1d0 |
#uc-error-trigger = dimm-error-trigger
|
|
|
09c1d0 |
uc-error-threshold = 1 / 24h
|
|
|
09c1d0 |
#ce-error-trigger = dimm-error-trigger
|
|
|
09c1d0 |
ce-error-threshold = 10 / 24h
|
|
|
09c1d0 |
|
|
|
09c1d0 |
[socket]
|
|
|
09c1d0 |
-# Memory error accounting per socket
|
|
|
09c1d0 |
+# Enable memory error accounting per socket.
|
|
|
09c1d0 |
socket-tracking-enabled = yes
|
|
|
09c1d0 |
-# Threshold and trigger for uncorrected memory errors on a socket
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+# Threshold and trigger for uncorrected memory errors on a socket.
|
|
|
09c1d0 |
# mem-uc-error-trigger = socket-memory-error-trigger
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
mem-uc-error-threshold = 100 / 24h
|
|
|
09c1d0 |
-# Threshold and trigger for corrected memory errors on a socket
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+# Trigger script for corrected memory errors on a socket.
|
|
|
09c1d0 |
mem-ce-error-trigger = socket-memory-error-trigger
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+# Threshold on when to trigger a corrected error for the socket.
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
mem-ce-error-threshold = 100 / 24h
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
# Log socket error threshold explicitely?
|
|
|
09c1d0 |
mem-ce-error-log = yes
|
|
|
09c1d0 |
|
|
|
09c1d0 |
+# Trigger script for uncorrected bus error events
|
|
|
09c1d0 |
bus-uc-threshold-trigger = bus-error-trigger
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+# Trigger script for uncorrected IOMCA errors
|
|
|
09c1d0 |
iomca-threshold-trigger = iomca-error-trigger
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+# Trigger script for other uncategorized errors
|
|
|
09c1d0 |
unknown-threshold-trigger = unknown-error-trigger
|
|
|
09c1d0 |
|
|
|
09c1d0 |
[cache]
|
|
|
09c1d0 |
-# Processing of cache error thresholds reported by Intel CPUs
|
|
|
09c1d0 |
+# Processing of cache error thresholds reported by Intel CPUs.
|
|
|
09c1d0 |
cache-threshold-trigger = cache-error-trigger
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
# Should cache threshold events be logged explicitely?
|
|
|
09c1d0 |
cache-threshold-log = yes
|
|
|
09c1d0 |
|
|
|
09c1d0 |
[page]
|
|
|
09c1d0 |
-# Memory error accouting per 4K memory page
|
|
|
09c1d0 |
-# Threshold for the correct memory errors trigger script
|
|
|
09c1d0 |
+# Memory error accounting per 4K memory page.
|
|
|
09c1d0 |
+# Threshold for the corrected memory errors trigger script.
|
|
|
09c1d0 |
memory-ce-threshold = 10 / 24h
|
|
|
09c1d0 |
-# Trigger script for corrected errors
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+# Trigger script for corrected errors.
|
|
|
09c1d0 |
# memory-ce-trigger = page-error-trigger
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
# Should page threshold events be logged explicitely?
|
|
|
09c1d0 |
memory-ce-log = yes
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
# specify the internal action in mcelog to exceeding a page error threshold
|
|
|
09c1d0 |
# this is done in addition to executing the trigger script if available
|
|
|
09c1d0 |
# off no action
|
|
|
09c1d0 |
diff --git a/mcelog.conf.5 b/mcelog.conf.5
|
|
|
09c1d0 |
new file mode 100644
|
|
|
09c1d0 |
index 0000000..5a9afda
|
|
|
09c1d0 |
--- /dev/null
|
|
|
09c1d0 |
+++ b/mcelog.conf.5
|
|
|
09c1d0 |
@@ -0,0 +1,283 @@
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+." Auto generated mcelog.conf manpage. Do not edit.
|
|
|
09c1d0 |
+.TH "mcelog.conf" 5 "mcelog"
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+.SH NAME
|
|
|
09c1d0 |
+mcelog.conf \- mcelog.conf reference
|
|
|
09c1d0 |
+.SH SYNOPSIS
|
|
|
09c1d0 |
+.B /etc/mcelog.conf
|
|
|
09c1d0 |
+.SH DESCRIPTION
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+/etc/mcelog.conf is the main configuration file for
|
|
|
09c1d0 |
+.B mcelog(8).
|
|
|
09c1d0 |
+This is a configuration file separated into sections including
|
|
|
09c1d0 |
+a default section.
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+General format
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B optionname = value
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+White space is not allowed in value currently, except at the end where it is dropped
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+In general all command line options that are not commands work here.
|
|
|
09c1d0 |
+See man mcelog or mcelog --help for a list.
|
|
|
09c1d0 |
+e.g. to enable the --no-syslog option use
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B no-syslog = yes (or no to disable)
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+When the option has an argument
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B logfile = /tmp/logfile
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Below are the options which are not command line options.
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Set cpu type for which mcelog decodes events:
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B cpu = type
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+For valid values for type please see mcelog --help.
|
|
|
09c1d0 |
+If this value is set incorrectly the decoded output will likely be incorrect.
|
|
|
09c1d0 |
+By default when this parameter is not set mcelog uses the CPU it is running on.
|
|
|
09c1d0 |
+on very new kernels the mcelog events reported by the kernel also carry
|
|
|
09c1d0 |
+the CPU type which is used too when available and not overridden.
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Enable daemon mode:
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B daemon = yes
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+By default mcelog just processes the currently pending events and exits.
|
|
|
09c1d0 |
+In daemon mode it will keep running as a daemon in the background and poll
|
|
|
09c1d0 |
+the kernel for events and then decode them.
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Filter out known broken events by default.
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B filter = yes
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Don't log memory errors individually.
|
|
|
09c1d0 |
+They still get accounted if that is enabled.
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B filter-memory-errors = yes
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Output in undecoded raw format to be easier machine readable
|
|
|
09c1d0 |
+(default is decoded).
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B raw = yes
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Set cpu mhz to decode uptime from time stamp counter (output
|
|
|
09c1d0 |
+unreliable, not needed on new kernels which report the event time
|
|
|
09c1d0 |
+directly). A lot of systems don't have a linear time stamp clock
|
|
|
09c1d0 |
+and the output is wrong then.
|
|
|
09c1d0 |
+Normally mcelog tries to figure out if the TSC is reliable
|
|
|
09c1d0 |
+and only uses the current frequency then.
|
|
|
09c1d0 |
+Setting a frequency forces timestamp decoding.
|
|
|
09c1d0 |
+This setting is obsolete with modern kernels which report the time
|
|
|
09c1d0 |
+directly.
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B cpumhz = 1800.00
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Log output options
|
|
|
09c1d0 |
+Log decoded machine checks in syslog (default stdout or syslog for daemon)
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B syslog = yes
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Log decoded machine checks in syslog with error level
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B syslog-error = yes
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Never log anything to syslog
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B no-syslog = yes
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Append log output to logfile instead of stdout. Only when no syslog logging is active
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B logfile = filename
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Use smbios information to decode dimms (needs root).
|
|
|
09c1d0 |
+This function is not recommended to use right now and generally not needed.
|
|
|
09c1d0 |
+The exception is memdb prepopulation, which is configured separately below.
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B dmi = no
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+When in daemon mode run as this user after set up.
|
|
|
09c1d0 |
+Note that the triggers will run as this user too.
|
|
|
09c1d0 |
+Setting this to non root will mean that triggers cannot take some corrective
|
|
|
09c1d0 |
+action, like offlining objects.
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B run-credentials-user = root
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Group to run as daemon with
|
|
|
09c1d0 |
+default to the group of the run-credentials-user
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B run-credentials-group = nobody
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.SS "The server config section"
|
|
|
09c1d0 |
+User allowed to access client socket.
|
|
|
09c1d0 |
+when set to * match any
|
|
|
09c1d0 |
+root is always allowed to access.
|
|
|
09c1d0 |
+default: root only
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B client-user = root
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Group allowed to access mcelog
|
|
|
09c1d0 |
+When no group is configured any group matches (but still user checking).
|
|
|
09c1d0 |
+when set to * match any
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B client-group = root
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Path to the unix socket for client<->server communication.
|
|
|
09c1d0 |
+When no socket-path is configured the server will not start
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B socket-path = /var/run/mcelog-client
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+When mcelog starts it checks if a server is already running. This configures the timeout
|
|
|
09c1d0 |
+for this check.
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B initial-ping-timeout = 2
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.SS "The dimm config section"
|
|
|
09c1d0 |
+Is the in memory dimm error tracking enabled?
|
|
|
09c1d0 |
+Only works on systems with integrated memory controller and
|
|
|
09c1d0 |
+which are supported.
|
|
|
09c1d0 |
+Only takes effect in daemon mode.
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B dimm-tracking-enabled = yes
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Use dmi information from the bios to prepopulate dimm database.
|
|
|
09c1d0 |
+Note this might not work with all BIOS and requires mcelog to run as root.
|
|
|
09c1d0 |
+Alternative is to let mcelog create DIMM objects on demand.
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B dmi-prepopulate = yes
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+Execute these triggers when the rate of corrected or uncorrected
|
|
|
09c1d0 |
+Errors per DIMM exceeds the threshold.
|
|
|
09c1d0 |
+Note when the hardware does not report DIMMs this might also
|
|
|
09c1d0 |
+be per channel.
|
|
|
09c1d0 |
+The default of 10/24h is reasonable for server quality
|
|
|
09c1d0 |
+DDR3 DIMMs as of 2009/10.
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B uc-error-trigger = dimm-error-trigger
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B uc-error-threshold = 1 / 24h
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B ce-error-trigger = dimm-error-trigger
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B ce-error-threshold = 10 / 24h
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.SS "The socket config section"
|
|
|
09c1d0 |
+Enable memory error accounting per socket.
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B socket-tracking-enabled = yes
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Threshold and trigger for uncorrected memory errors on a socket.
|
|
|
09c1d0 |
+mem-uc-error-trigger = socket-memory-error-trigger
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B mem-uc-error-threshold = 100 / 24h
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Trigger script for corrected memory errors on a socket.
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B mem-ce-error-trigger = socket-memory-error-trigger
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Threshold on when to trigger a corrected error for the socket.
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B mem-ce-error-threshold = 100 / 24h
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+ log socket error threshold explicitly?
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B mem-ce-error-log = yes
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Trigger script for uncorrected bus error events
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B bus-uc-threshold-trigger = bus-error-trigger
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Trigger script for uncorrected iomca errors
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B iomca-threshold-trigger = iomca-error-trigger
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Trigger script for other uncategorized errors
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B unknown-threshold-trigger = unknown-error-trigger
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.SS "The cache config section"
|
|
|
09c1d0 |
+Processing of cache error thresholds reported by intel cpus.
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B cache-threshold-trigger = cache-error-trigger
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Should cache threshold events be logged explicitly?
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B cache-threshold-log = yes
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.SS "The page config section"
|
|
|
09c1d0 |
+Memory error accounting per 4k memory page.
|
|
|
09c1d0 |
+Threshold for the corrected memory errors trigger script.
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B memory-ce-threshold = 10 / 24h
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Trigger script for corrected errors.
|
|
|
09c1d0 |
+memory-ce-trigger = page-error-trigger
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Should page threshold events be logged explicitly?
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B memory-ce-log = yes
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Specify the internal action in mcelog to exceeding a page error threshold
|
|
|
09c1d0 |
+this is done in addition to executing the trigger script if available
|
|
|
09c1d0 |
+off no action
|
|
|
09c1d0 |
+account only account errors
|
|
|
09c1d0 |
+soft try to soft-offline page without killing any processes
|
|
|
09c1d0 |
+ This requires an up-to-date kernel. Might not be successful.
|
|
|
09c1d0 |
+hard try to hard-offline page by killing processes
|
|
|
09c1d0 |
+ Requires an up-to-date kernel. Might not be successful.
|
|
|
09c1d0 |
+soft-then-hard First try to soft offline, then try hard offlining
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B memory-ce-action = off|account|soft|hard|soft-then-hard
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B memory-ce-action = soft
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.SS "The trigger config section"
|
|
|
09c1d0 |
+Maximum number of running triggers
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B children-max = 2
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Execute triggers in this directory
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B directory = /etc/mcelog
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+.SH SEE ALSO
|
|
|
09c1d0 |
+.BR mcelog (8)
|
|
|
09c1d0 |
+,
|
|
|
09c1d0 |
+.B http://www.mcelog.org
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
diff --git a/mcelog.h b/mcelog.h
|
|
|
09c1d0 |
index 550a0a5..6c097cf 100644
|
|
|
09c1d0 |
--- a/mcelog.h
|
|
|
09c1d0 |
+++ b/mcelog.h
|
|
|
09c1d0 |
@@ -65,14 +65,18 @@ struct mce {
|
|
|
09c1d0 |
#define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */
|
|
|
09c1d0 |
#define MCI_STATUS_S (1ULL<<56) /* signalled */
|
|
|
09c1d0 |
#define MCI_STATUS_AR (1ULL<<55) /* action-required */
|
|
|
09c1d0 |
+#define MCI_STATUS_FWST (1ULL<<37) /* Firmware updated status indicator */
|
|
|
09c1d0 |
|
|
|
09c1d0 |
#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */
|
|
|
09c1d0 |
#define MCG_STATUS_EIPV (1ULL<<1) /* eip points to correct instruction */
|
|
|
09c1d0 |
#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */
|
|
|
09c1d0 |
+#define MCG_STATUS_LMCES (1ULL<<3) /* local machine check signaled */
|
|
|
09c1d0 |
|
|
|
09c1d0 |
#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */
|
|
|
09c1d0 |
#define MCG_TES_P (1ULL<<11) /* Yellow bit cache threshold supported */
|
|
|
09c1d0 |
#define MCG_SER_P (1ULL<<24) /* MCA recovery / new status */
|
|
|
09c1d0 |
+#define MCG_ELOG_P (1ULL<<26) /* Extended error log supported */
|
|
|
09c1d0 |
+#define MCG_LMCE_P (1ULL<<27) /* Local machine check supported */
|
|
|
09c1d0 |
|
|
|
09c1d0 |
#define NELE(x) (sizeof(x)/sizeof(*(x)))
|
|
|
09c1d0 |
#define err(x) perror(x),exit(1)
|
|
|
09c1d0 |
@@ -119,6 +123,9 @@ enum cputype {
|
|
|
09c1d0 |
CPU_IVY_BRIDGE_EPEX,
|
|
|
09c1d0 |
CPU_HASWELL,
|
|
|
09c1d0 |
CPU_HASWELL_EPEX,
|
|
|
09c1d0 |
+ CPU_BROADWELL,
|
|
|
09c1d0 |
+ CPU_KNIGHTS_LANDING,
|
|
|
09c1d0 |
+ CPU_ATOM,
|
|
|
09c1d0 |
};
|
|
|
09c1d0 |
|
|
|
09c1d0 |
enum option_ranges {
|
|
|
09c1d0 |
diff --git a/mcelog.service b/mcelog.service
|
|
|
09c1d0 |
new file mode 100644
|
|
|
09c1d0 |
index 0000000..c5aaf07
|
|
|
09c1d0 |
--- /dev/null
|
|
|
09c1d0 |
+++ b/mcelog.service
|
|
|
09c1d0 |
@@ -0,0 +1,10 @@
|
|
|
09c1d0 |
+[Unit]
|
|
|
09c1d0 |
+Description=Machine Check Exception Logging Daemon
|
|
|
09c1d0 |
+After=syslog.target
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+[Service]
|
|
|
09c1d0 |
+ExecStart=/usr/sbin/mcelog --ignorenodev --daemon --foreground
|
|
|
09c1d0 |
+StandardOutput=syslog
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+[Install]
|
|
|
09c1d0 |
+WantedBy=multi-user.target
|
|
|
09c1d0 |
diff --git a/mcelog.triggers.5 b/mcelog.triggers.5
|
|
|
09c1d0 |
new file mode 100644
|
|
|
09c1d0 |
index 0000000..510bbef
|
|
|
09c1d0 |
--- /dev/null
|
|
|
09c1d0 |
+++ b/mcelog.triggers.5
|
|
|
09c1d0 |
@@ -0,0 +1,231 @@
|
|
|
09c1d0 |
+'\" t
|
|
|
09c1d0 |
+.TH "mcelog.triggers" 5 "mcelog"
|
|
|
09c1d0 |
+.SH NAME
|
|
|
09c1d0 |
+mcelog.triggers \- mcelog trigger scripts reference
|
|
|
09c1d0 |
+.SH SYNOPSIS
|
|
|
09c1d0 |
+.B /etc/mcelog/bus-error-trigger
|
|
|
09c1d0 |
+.br
|
|
|
09c1d0 |
+.B /etc/mcelog/cache-error-trigger
|
|
|
09c1d0 |
+.br
|
|
|
09c1d0 |
+.B /etc/mcelog/dimm-error-trigger
|
|
|
09c1d0 |
+.br
|
|
|
09c1d0 |
+.B /etc/mcelog/iomca-error-trigger
|
|
|
09c1d0 |
+.br
|
|
|
09c1d0 |
+.B /etc/mcelog/page-error-trigger
|
|
|
09c1d0 |
+.br
|
|
|
09c1d0 |
+.B /etc/mcelog/socket-memory-error-trigger
|
|
|
09c1d0 |
+.br
|
|
|
09c1d0 |
+.B /etc/mcelog/unknown-error-trigger
|
|
|
09c1d0 |
+.br
|
|
|
09c1d0 |
+.SH DESCRIPTION
|
|
|
09c1d0 |
+.BR mcelog(8)
|
|
|
09c1d0 |
+maintains thresholds of errors using a
|
|
|
09c1d0 |
+.I leaky-bucket
|
|
|
09c1d0 |
+algorithm.
|
|
|
09c1d0 |
+When the number of errors in a specific
|
|
|
09c1d0 |
+time window exceeds a pre-configured threshold a
|
|
|
09c1d0 |
+.I trigger
|
|
|
09c1d0 |
+will be executed. Triggers are usually shell scripts in the
|
|
|
09c1d0 |
+.B /etc/mcelog
|
|
|
09c1d0 |
+directory
|
|
|
09c1d0 |
+but can be also other internal actions. Thresholds and triggers
|
|
|
09c1d0 |
+can be configured in
|
|
|
09c1d0 |
+.BR mcelog.conf(5)
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+Trigger will run as the user configured for mcelog
|
|
|
09c1d0 |
+in
|
|
|
09c1d0 |
+.I mcelog.conf,
|
|
|
09c1d0 |
+by default root. The default trigger action can
|
|
|
09c1d0 |
+be overridden by specifying a different trigger script in the configuration file.
|
|
|
09c1d0 |
+Actions in addition to the default trigger
|
|
|
09c1d0 |
+(like notifying an administrator) can be put into the respective
|
|
|
09c1d0 |
+.I /etc/mcelog/*.local
|
|
|
09c1d0 |
+script which is executed after the default action. This allows updating the default
|
|
|
09c1d0 |
+scripts without overriding local actions. All trigger actions are also
|
|
|
09c1d0 |
+logged to syslog.
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B "The DIMM and socket memory error triggers"
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+The
|
|
|
09c1d0 |
+.B /etc/mcelog/dimm-error-trigger
|
|
|
09c1d0 |
+and
|
|
|
09c1d0 |
+.B /etc/mcelog/socket-memory-error-trigger
|
|
|
09c1d0 |
+scripts are executed when a DIMM or a CPU socket exceeds
|
|
|
09c1d0 |
+a configured corrected or uncorrected memory error threshold.
|
|
|
09c1d0 |
+The thresholds are configured in the
|
|
|
09c1d0 |
+.B mcelog.conf
|
|
|
09c1d0 |
+.I [dimm]
|
|
|
09c1d0 |
+and
|
|
|
09c1d0 |
+.I [socket]
|
|
|
09c1d0 |
+sections.
|
|
|
09c1d0 |
+The default triggers log a warning message in the system log.
|
|
|
09c1d0 |
+The triggers are only executed when mcelog runs as a daemon.
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+Arguments are passed as environment variables
|
|
|
09c1d0 |
+.TS
|
|
|
09c1d0 |
+tab(:);
|
|
|
09c1d0 |
+l l.
|
|
|
09c1d0 |
+THRESHOLD:human readable threshold status
|
|
|
09c1d0 |
+MESSAGE:Human readable consolidated error message
|
|
|
09c1d0 |
+TOTALCOUNT:total corrected or uncorrected count of errors for current DIMM depending on what triggered the event
|
|
|
09c1d0 |
+LOCATION:Consolidated location as a single string
|
|
|
09c1d0 |
+DMI_LOCATION:DIMM location from DMI/SMBIOS if available
|
|
|
09c1d0 |
+DMI_NAME:DIMM identifier from DMI/SMBIOS if available
|
|
|
09c1d0 |
+DIMM:DIMM number reported by hardware
|
|
|
09c1d0 |
+CHANNEL:Channel number reported by hardware
|
|
|
09c1d0 |
+SOCKETID:Socket ID of CPU that includes the memory controller with the DIMM
|
|
|
09c1d0 |
+CECOUNT:Total corrected error count for DIMM
|
|
|
09c1d0 |
+UCCOUNT:Total uncorrected error count for DIMM
|
|
|
09c1d0 |
+LASTEVENT:Time stamp of event that triggered threshold (in time_t format, seconds)
|
|
|
09c1d0 |
+THRESHOLD_COUNT:Total number of events in current threshold time period of specific type
|
|
|
09c1d0 |
+.TE
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+After the default action local actions in
|
|
|
09c1d0 |
+.B /etc/mcelog/dimm-error-trigger.local
|
|
|
09c1d0 |
+or respective
|
|
|
09c1d0 |
+.B /etc/mcelog/socket-memory-error-trigger.local
|
|
|
09c1d0 |
+are executed.
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B "The page error trigger"
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+The
|
|
|
09c1d0 |
+.B /etc/mcelog/page-error-trigger
|
|
|
09c1d0 |
+script is
|
|
|
09c1d0 |
+executed by mcelog in daemon mode when a page
|
|
|
09c1d0 |
+in memory exceeds a pre-configured corrected or uncorrected error threshold.
|
|
|
09c1d0 |
+mcelog internally also implements offlining the page through the kernel.
|
|
|
09c1d0 |
+This is configured through the
|
|
|
09c1d0 |
+.I [page]
|
|
|
09c1d0 |
+section of
|
|
|
09c1d0 |
+.BR mcelog.conf(5)
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+The environment arguments are the same as for the
|
|
|
09c1d0 |
+.I dimm-error-trigger
|
|
|
09c1d0 |
+script
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+After the default action local actions in
|
|
|
09c1d0 |
+.I /etc/mcelog/page-error-trigger.local are executed.
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B "The cache error trigger"
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+The
|
|
|
09c1d0 |
+.I /etc/mcelog/cache-error-trigger
|
|
|
09c1d0 |
+shell script is called for cache error handling in daemon mode
|
|
|
09c1d0 |
+when a CPU reports excessive corrected cache errors.
|
|
|
09c1d0 |
+This could be an indication of future uncorrected errors.
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+This trigger is configured through the
|
|
|
09c1d0 |
+.B [cache]
|
|
|
09c1d0 |
+section in the
|
|
|
09c1d0 |
+.BR mcelog.conf(5)
|
|
|
09c1d0 |
+configuration file. The threshold is defined by the CPU. The default trigger offlines the affected CPU cores, unless it is the last core running.
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Arguments are passed as environment variables
|
|
|
09c1d0 |
+.TS
|
|
|
09c1d0 |
+tab(:);
|
|
|
09c1d0 |
+l l.
|
|
|
09c1d0 |
+MESSAGE:Human readable error message
|
|
|
09c1d0 |
+CPU:Linux CPU number that triggered the error
|
|
|
09c1d0 |
+LEVEL:Cache level affected by error
|
|
|
09c1d0 |
+TYPE:Cache type affected by error (Data,Instruction,Generic)
|
|
|
09c1d0 |
+AFFECTED_CPUS:List of CPUs sharing the affected cache
|
|
|
09c1d0 |
+SOCKETID:Socket ID of affected CPU
|
|
|
09c1d0 |
+.TE
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+After the default action local actions in
|
|
|
09c1d0 |
+.I /etc/mcelog/cache-error-trigger.local are executed.
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B "The bus-uc-threshold-trigger"
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+The
|
|
|
09c1d0 |
+.B bus-uc-threshold-trigger
|
|
|
09c1d0 |
+runs on uncorrected errors on an IO bus. It is configured through the
|
|
|
09c1d0 |
+.B bus-uc-threshold-trigger
|
|
|
09c1d0 |
+and
|
|
|
09c1d0 |
+.B bus-uc-threshold-trigger-threshold
|
|
|
09c1d0 |
+options in
|
|
|
09c1d0 |
+.I /etc/mcelog.conf(5).
|
|
|
09c1d0 |
+By default it logs a message with the error location to the system log.
|
|
|
09c1d0 |
+After the default action local actions in
|
|
|
09c1d0 |
+.I /etc/mcelog/bus-error-trigger.local
|
|
|
09c1d0 |
+are executed.
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Arguments are passed as environment variables
|
|
|
09c1d0 |
+.TS
|
|
|
09c1d0 |
+tab(:);
|
|
|
09c1d0 |
+l l.
|
|
|
09c1d0 |
+MESSAGE:Human readable consolidated error message.
|
|
|
09c1d0 |
+LOCATION:Consolidated location as a single string
|
|
|
09c1d0 |
+SOCKETID:Socket ID of CPU that includes the memory controller with the DIMM
|
|
|
09c1d0 |
+LEVEL:Interconnect level
|
|
|
09c1d0 |
+PARTICIPATION:Processor Participation (Originator, Responder or Observer)
|
|
|
09c1d0 |
+REQUEST:Request type (read, write, prefetch, etc.)
|
|
|
09c1d0 |
+ORIGIN :Memory or IO
|
|
|
09c1d0 |
+TIMEOUT:The request timed out or not
|
|
|
09c1d0 |
+.TE
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B "The iomca-error-trigger"
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+The
|
|
|
09c1d0 |
+.B iomca-error-trigger
|
|
|
09c1d0 |
+runs when a socket receives bus or interconnect errors.
|
|
|
09c1d0 |
+It is configured through the
|
|
|
09c1d0 |
+.B iomca-error-trigger
|
|
|
09c1d0 |
+and
|
|
|
09c1d0 |
+.B iomca-error-trigger-threshold
|
|
|
09c1d0 |
+options in
|
|
|
09c1d0 |
+.I /etc/mcelog.conf. By default it logs a message with the error location to the system log.
|
|
|
09c1d0 |
+After the default action local actions in
|
|
|
09c1d0 |
+.I /etc/mcelog/iomca-error-trigger.local are executed.
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Arguments are passed as environment variables
|
|
|
09c1d0 |
+.TS
|
|
|
09c1d0 |
+tab(:);
|
|
|
09c1d0 |
+l l.
|
|
|
09c1d0 |
+MESSAGE:Human readable consolidated error message
|
|
|
09c1d0 |
+LOCATION:Consolidated location as a single string
|
|
|
09c1d0 |
+SOCKETID:Socket ID of CPU that includes the memory controller with the DIMM
|
|
|
09c1d0 |
+CPU:Linux CPU number that triggered the error
|
|
|
09c1d0 |
+SET:PCI segment number
|
|
|
09c1d0 |
+BUS:PCI bus number
|
|
|
09c1d0 |
+DEVICE:PCI device number
|
|
|
09c1d0 |
+FUNCTION:PCI function number
|
|
|
09c1d0 |
+.TE
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+.B "The unknown-error-trigger"
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+The
|
|
|
09c1d0 |
+.B unknown-error-trigger
|
|
|
09c1d0 |
+runs on any errors not otherwise categorized.
|
|
|
09c1d0 |
+It is configured through the
|
|
|
09c1d0 |
+.B unknown-error-trigger
|
|
|
09c1d0 |
+and
|
|
|
09c1d0 |
+.B unknown-error-trigger-threshold
|
|
|
09c1d0 |
+options in
|
|
|
09c1d0 |
+.I /etc/mcelog.conf.
|
|
|
09c1d0 |
+By default it logs a message to the system log.
|
|
|
09c1d0 |
+After the default action local actions in
|
|
|
09c1d0 |
+.I /etc/mcelog/unknown-error-trigger.local
|
|
|
09c1d0 |
+are executed.
|
|
|
09c1d0 |
+.PP
|
|
|
09c1d0 |
+Arguments are passed as environment variables
|
|
|
09c1d0 |
+.TS
|
|
|
09c1d0 |
+tab(:);
|
|
|
09c1d0 |
+l l.
|
|
|
09c1d0 |
+MESSAGE:Human readable consolidated error message
|
|
|
09c1d0 |
+LOCATION:Consolidated location as a single string
|
|
|
09c1d0 |
+SOCKETID:Socket ID of CPU that includes the memory controller with the DIMM
|
|
|
09c1d0 |
+CPU:Linux CPU number that triggered the error
|
|
|
09c1d0 |
+STATUS:IA32_MCi_STATUS register value
|
|
|
09c1d0 |
+ADDR:IA32_MCi_ADDR register value
|
|
|
09c1d0 |
+MISC:IA32_MCi_MISC register value
|
|
|
09c1d0 |
+MCGSTATUS:IA32_MCG_STATUS register value
|
|
|
09c1d0 |
+MCGCAP:IA32_MCG_CAP register value
|
|
|
09c1d0 |
+.TE
|
|
|
09c1d0 |
+.SH SEE ALSO
|
|
|
09c1d0 |
+http://www.mcelog.org
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+.B mcelog(8),
|
|
|
09c1d0 |
+.B mcelog.conf(5)
|
|
|
09c1d0 |
diff --git a/memdb.c b/memdb.c
|
|
|
09c1d0 |
index bde8113..7a33750 100644
|
|
|
09c1d0 |
--- a/memdb.c
|
|
|
09c1d0 |
+++ b/memdb.c
|
|
|
09c1d0 |
@@ -270,6 +270,7 @@ static void dump_errtype(char *name, struct err_type *e, FILE *f, enum printflag
|
|
|
09c1d0 |
int all = (flags & DUMP_ALL);
|
|
|
09c1d0 |
char *s;
|
|
|
09c1d0 |
|
|
|
09c1d0 |
+ bucket_age(bc, &e->bucket, bucket_time());
|
|
|
09c1d0 |
if (e->count || e->bucket.count || all)
|
|
|
09c1d0 |
fprintf(f, "%s:\n", name);
|
|
|
09c1d0 |
if (e->count || all) {
|
|
|
09c1d0 |
@@ -382,7 +383,7 @@ parse_dimm_addr(char *bl, unsigned *socketid, unsigned *channel, unsigned *dimm)
|
|
|
09c1d0 |
}
|
|
|
09c1d0 |
|
|
|
09c1d0 |
/* Prepopulate DIMM database from BIOS information */
|
|
|
09c1d0 |
-void prefill_memdb(void)
|
|
|
09c1d0 |
+void prefill_memdb(int do_dmi)
|
|
|
09c1d0 |
{
|
|
|
09c1d0 |
static int initialized;
|
|
|
09c1d0 |
int i;
|
|
|
09c1d0 |
@@ -395,7 +396,7 @@ void prefill_memdb(void)
|
|
|
09c1d0 |
if (!memdb_enabled)
|
|
|
09c1d0 |
return;
|
|
|
09c1d0 |
initialized = 1;
|
|
|
09c1d0 |
- if (config_bool("dimm", "dmi-prepopulate") == 0)
|
|
|
09c1d0 |
+ if (config_bool("dimm", "dmi-prepopulate") == 0 || !do_dmi)
|
|
|
09c1d0 |
return;
|
|
|
09c1d0 |
if (opendmi() < 0)
|
|
|
09c1d0 |
return;
|
|
|
09c1d0 |
diff --git a/memdb.h b/memdb.h
|
|
|
09c1d0 |
index 5c68581..afc3348 100644
|
|
|
09c1d0 |
--- a/memdb.h
|
|
|
09c1d0 |
+++ b/memdb.h
|
|
|
09c1d0 |
@@ -11,7 +11,7 @@ enum printflags {
|
|
|
09c1d0 |
DUMP_BIOS = (1 << 1),
|
|
|
09c1d0 |
};
|
|
|
09c1d0 |
|
|
|
09c1d0 |
-void prefill_memdb(void);
|
|
|
09c1d0 |
+void prefill_memdb(int do_dmi);
|
|
|
09c1d0 |
void memdb_config(void);
|
|
|
09c1d0 |
void dump_memory_errors(FILE *f, enum printflags flags);
|
|
|
09c1d0 |
|
|
|
09c1d0 |
diff --git a/p4.c b/p4.c
|
|
|
09c1d0 |
index f938196..2bf1eee 100644
|
|
|
09c1d0 |
--- a/p4.c
|
|
|
09c1d0 |
+++ b/p4.c
|
|
|
09c1d0 |
@@ -317,6 +317,10 @@ static int decode_mci(__u64 status, __u64 misc, int cpu, unsigned mcgcap, int *i
|
|
|
09c1d0 |
if (status & (MCI_STATUS_S|MCI_STATUS_AR))
|
|
|
09c1d0 |
Wprintf("%s\n", arstate[(status >> 55) & 3]);
|
|
|
09c1d0 |
|
|
|
09c1d0 |
+ if ((mcgcap & MCG_SER_P) && (status & MCI_STATUS_FWST)) {
|
|
|
09c1d0 |
+ Wprintf("Firmware may have updated this error\n");
|
|
|
09c1d0 |
+ }
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
if ((mcgcap == 0 || (mcgcap & MCG_TES_P)) && !(status & MCI_STATUS_UC)) {
|
|
|
09c1d0 |
track = (status >> 53) & 3;
|
|
|
09c1d0 |
decode_tracking(track);
|
|
|
09c1d0 |
@@ -334,6 +338,8 @@ static void decode_mcg(__u64 mcgstatus)
|
|
|
09c1d0 |
Wprintf("EIPV ");
|
|
|
09c1d0 |
if (mcgstatus & MCG_STATUS_MCIP)
|
|
|
09c1d0 |
Wprintf("MCIP ");
|
|
|
09c1d0 |
+ if (mcgstatus & MCG_STATUS_LMCES)
|
|
|
09c1d0 |
+ Wprintf("LMCE ");
|
|
|
09c1d0 |
Wprintf("\n");
|
|
|
09c1d0 |
}
|
|
|
09c1d0 |
|
|
|
09c1d0 |
diff --git a/server.c b/server.c
|
|
|
09c1d0 |
index 344eb38..a1fa7da 100644
|
|
|
09c1d0 |
--- a/server.c
|
|
|
09c1d0 |
+++ b/server.c
|
|
|
09c1d0 |
@@ -291,7 +291,7 @@ static int server_ping(struct sockaddr_un *un)
|
|
|
09c1d0 |
{
|
|
|
09c1d0 |
struct sigaction oldsa;
|
|
|
09c1d0 |
struct sigaction sa = { .sa_handler = ping_timeout };
|
|
|
09c1d0 |
- int ret = -1, n;
|
|
|
09c1d0 |
+ int ret, n;
|
|
|
09c1d0 |
char buf[10];
|
|
|
09c1d0 |
int fd = socket(PF_UNIX, SOCK_STREAM, 0);
|
|
|
09c1d0 |
if (fd < 0)
|
|
|
09c1d0 |
@@ -299,6 +299,7 @@ static int server_ping(struct sockaddr_un *un)
|
|
|
09c1d0 |
|
|
|
09c1d0 |
sigaction(SIGALRM, &sa, &oldsa);
|
|
|
09c1d0 |
if (sigsetjmp(ping_timeout_ctx, 1) == 0) {
|
|
|
09c1d0 |
+ ret = 0;
|
|
|
09c1d0 |
alarm(initial_ping_timeout);
|
|
|
09c1d0 |
if (connect(fd, un, sizeof(struct sockaddr_un)) < 0)
|
|
|
09c1d0 |
goto cleanup;
|
|
|
09c1d0 |
@@ -308,7 +309,8 @@ static int server_ping(struct sockaddr_un *un)
|
|
|
09c1d0 |
goto cleanup;
|
|
|
09c1d0 |
if (n == 5 && !memcmp(buf, "pong\n", 5))
|
|
|
09c1d0 |
ret = 0;
|
|
|
09c1d0 |
- }
|
|
|
09c1d0 |
+ } else
|
|
|
09c1d0 |
+ ret = -1;
|
|
|
09c1d0 |
cleanup:
|
|
|
09c1d0 |
sigaction(SIGALRM, &oldsa, NULL);
|
|
|
09c1d0 |
alarm(0);
|
|
|
09c1d0 |
diff --git a/tests/test b/tests/test
|
|
|
09c1d0 |
index 35bebd2..148bf1f 100755
|
|
|
09c1d0 |
--- a/tests/test
|
|
|
09c1d0 |
+++ b/tests/test
|
|
|
09c1d0 |
@@ -17,6 +17,8 @@ if [ "$(whoami)" != "root" ] ; then
|
|
|
09c1d0 |
exit 1
|
|
|
09c1d0 |
fi
|
|
|
09c1d0 |
|
|
|
09c1d0 |
+[ ! -f /dev/mce-inject ] && modprobe mce-inject
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
echo "++++++++++++ running $1 test +++++++++++++++++++"
|
|
|
09c1d0 |
|
|
|
09c1d0 |
# disable trigger
|
|
|
09c1d0 |
diff --git a/trigger.c b/trigger.c
|
|
|
09c1d0 |
index 19466a6..5caca34 100644
|
|
|
09c1d0 |
--- a/trigger.c
|
|
|
09c1d0 |
+++ b/trigger.c
|
|
|
09c1d0 |
@@ -115,11 +115,18 @@ static void finish_child(pid_t child, int status)
|
|
|
09c1d0 |
static void child_handler(int sig, siginfo_t *si, void *ctx)
|
|
|
09c1d0 |
{
|
|
|
09c1d0 |
int status;
|
|
|
09c1d0 |
+ pid_t pid;
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
if (waitpid(si->si_pid, &status, WNOHANG) < 0) {
|
|
|
09c1d0 |
SYSERRprintf("Cannot collect child %d", si->si_pid);
|
|
|
09c1d0 |
return;
|
|
|
09c1d0 |
}
|
|
|
09c1d0 |
finish_child(si->si_pid, status);
|
|
|
09c1d0 |
+
|
|
|
09c1d0 |
+ /* Check other child(ren)'s status to avoid zombie process */
|
|
|
09c1d0 |
+ while ((pid = waitpid(-1, &status, WNOHANG)) > 0) {
|
|
|
09c1d0 |
+ finish_child(pid, status);
|
|
|
09c1d0 |
+ }
|
|
|
09c1d0 |
}
|
|
|
09c1d0 |
|
|
|
09c1d0 |
void trigger_setup(void)
|
|
|
09c1d0 |
diff --git a/triggers/bus-error-trigger b/triggers/bus-error-trigger
|
|
|
09c1d0 |
old mode 100644
|
|
|
09c1d0 |
new mode 100755
|
|
|
09c1d0 |
diff --git a/triggers/iomca-error-trigger b/triggers/iomca-error-trigger
|
|
|
09c1d0 |
old mode 100644
|
|
|
09c1d0 |
new mode 100755
|
|
|
09c1d0 |
diff --git a/triggers/unknown-error-trigger b/triggers/unknown-error-trigger
|
|
|
09c1d0 |
old mode 100644
|
|
|
09c1d0 |
new mode 100755
|
|
|
09c1d0 |
index b924a0e..fa2866c
|
|
|
09c1d0 |
--- a/triggers/unknown-error-trigger
|
|
|
09c1d0 |
+++ b/triggers/unknown-error-trigger
|
|
|
09c1d0 |
@@ -9,7 +9,7 @@
|
|
|
09c1d0 |
# CPU Linux CPU number that triggered the error
|
|
|
09c1d0 |
# STATUS IA32_MCi_STATUS register value
|
|
|
09c1d0 |
# ADDR IA32_MCi_ADDR register value
|
|
|
09c1d0 |
-# MISC IA32_MCi_MISC regiser value
|
|
|
09c1d0 |
+# MISC IA32_MCi_MISC register value
|
|
|
09c1d0 |
# MCGSTATUS IA32_MCG_STATUS register value
|
|
|
09c1d0 |
# MCGCAP IA32_MCG_CAP register value
|
|
|
09c1d0 |
# For details on the register layout please see the Intel SDM http://www.intel.com/sdm
|