diff -urNp mcelog-d2e13bf0.orig/broadwell_epex.c mcelog-d2e13bf0/broadwell_epex.c
--- mcelog-d2e13bf0.orig/broadwell_epex.c 2016-11-30 11:23:54.542909636 -0500
+++ mcelog-d2e13bf0/broadwell_epex.c 2016-11-30 11:24:12.203619329 -0500
@@ -23,6 +23,11 @@
#include "broadwell_epex.h"
#include "memdb.h"
+/* Memory error was corrected by mirroring with channel failover */
+#define BDW_MCI_MISC_FO (1ULL<<41)
+/* Memory error was corrected by mirroring and primary channel scrubbed successfully */
+#define BDW_MCI_MISC_MC (1ULL<<42)
+
/* See IA32 SDM Vol3B Table 16-20 */
static char *pcu_1[] = {
@@ -147,3 +152,23 @@ void bdw_epex_decode_model(int cputype,
break;
}
}
+
+/*
+ * return: 0 - CE by normal ECC (also when bank is not 7/8 or MISC is not valid)
+ * 1 - CE by mirroring with channel failover (wins if both MISC bits are set)
+ * 2 - CE by mirroring and primary channel scrubbed successfully
+ */
+int bdw_epex_ce_type(int bank, u64 status, u64 misc)
+{
+ if (!(bank == 7 || bank == 8)) /* only banks 7 and 8 are classified here */
+ return 0;
+
+ if (status & MCI_STATUS_MISCV) { /* misc is only consulted when status flags it valid */
+ if (misc & BDW_MCI_MISC_FO)
+ return 1;
+ if (misc & BDW_MCI_MISC_MC)
+ return 2;
+ }
+
+ return 0;
+}
diff -urNp mcelog-d2e13bf0.orig/broadwell_epex.h mcelog-d2e13bf0/broadwell_epex.h
--- mcelog-d2e13bf0.orig/broadwell_epex.h 2016-11-30 11:23:54.542909636 -0500
+++ mcelog-d2e13bf0/broadwell_epex.h 2016-11-30 11:24:12.203619329 -0500
@@ -1 +1,2 @@
void bdw_epex_decode_model(int cputype, int bank, u64 status, u64 misc);
+int bdw_epex_ce_type(int bank, u64 status, u64 misc);
diff -urNp mcelog-d2e13bf0.orig/client.c mcelog-d2e13bf0/client.c
--- mcelog-d2e13bf0.orig/client.c 2016-11-30 11:23:54.530909154 -0500
+++ mcelog-d2e13bf0/client.c 2016-11-30 11:24:12.203619329 -0500
@@ -67,3 +67,11 @@ void ask_server(char *command)
SYSERRprintf("client read");
}
+
+void client_cleanup(void)
+{
+ char *path = config_string("server", "socket-path"); /* "socket-path" entry of the "server" config section */
+ if (!path)
+ path = SOCKET_PATH; /* no configured value: fall back to SOCKET_PATH */
+ unlink(path); /* remove that path; the return value of unlink() is not checked */
+}
diff -urNp mcelog-d2e13bf0.orig/client.h mcelog-d2e13bf0/client.h
--- mcelog-d2e13bf0.orig/client.h 2016-11-30 11:23:54.531909194 -0500
+++ mcelog-d2e13bf0/client.h 2016-11-30 11:24:12.203619329 -0500
@@ -1 +1,2 @@
void ask_server(char *command);
+void client_cleanup(void);
diff -urNp mcelog-d2e13bf0.orig/db.c mcelog-d2e13bf0/db.c
--- mcelog-d2e13bf0.orig/db.c 2016-11-30 11:23:54.531909194 -0500
+++ mcelog-d2e13bf0/db.c 1969-12-31 19:00:00.000000000 -0500
@@ -1,599 +0,0 @@
-/* Copyright (C) 2006 Andi Kleen, SuSE Labs.
- Dumb database manager.
- not suitable for large datasets, but human readable files and simple.
- assumes groups and entries-per-group are max low double digits.
- the in memory presentation could be easily optimized with a few
- hashes, but that shouldn't be needed for now.
- Note: obsolete, new design uses in memory databases only
-
- mcelog is free software; you can redistribute it and/or
- modify it under the terms of the GNU General Public
- License as published by the Free Software Foundation; version
- 2.
-
- mcelog is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should find a copy of v2 of the GNU General Public License somewhere
- on your Linux system; if not, write to the Free Software Foundation,
- Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
-
-/* TBD:
- add lock file to protect final rename
- timeout for locks
-*/
-
-#define _GNU_SOURCE 1
-#include <stdio.h>
-#include <ctype.h>
-#include <string.h>
-#include <errno.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <stdarg.h>
-#include <sys/fcntl.h>
-#include <sys/file.h>
-#include <assert.h>
-
-#include "db.h"
-#include "memutil.h"
-
-/* file format
-
-# comment
-[group1]
-entry1: value
-entry2: value
-
-# comment
-# comment2
-[group2]
-entry: value
-
-value is anything before new line, but first will be skipped
-spaces are allowed in entry names or groups
-comments are preserved, but moved in front of the group
-blank lines allowed.
-
-code doesnt check for unique records/entries right now. first wins.
-
-*/
-
-struct entry {
- char *name;
- char *val;
-};
-
-struct group {
- struct group *next;
- char *name;
- struct entry *entries;
- char *comment;
- int numentries;
-};
-
-#define ENTRY_CHUNK (128 / sizeof(struct entry))
-
-struct database {
- struct group *groups;
- FILE *fh;
- char *fn;
- int dirty;
-};
-
-static int read_db(struct database *db);
-static FILE *open_file(char *fn, int wr);
-static void free_group(struct group *g);
-
-static void DBerror(char *fmt, ...)
-{
- va_list ap;
- va_start(ap,fmt);
- vfprintf(stderr, fmt, ap);
- va_end(ap);
- exit(1);
-}
-
-#define DB_NEW(p) ((p) = xalloc(sizeof(*(p))))
-
-static struct group *alloc_group(char *name)
-{
- struct group *g;
- DB_NEW(g);
- g->entries = xalloc(ENTRY_CHUNK * sizeof(struct entry));
- g->name = name;
- return g;
-}
-
-static char *cleanline(char *s)
-{
- char *p;
- while (isspace(*s))
- s++;
- if (*s == 0)
- return NULL;
- p = strchr(s, '\n');
- if (p)
- *p = 0;
- return s;
-}
-
-struct database *open_db(char *fn, int wr)
-{
- struct database *db;
-
- DB_NEW(db);
- db->fh = open_file(fn, wr);
- if (!db->fh) {
- DBerror("Cannot open database %s\n", fn);
- free(db);
- return NULL;
- }
- db->fn = xstrdup(fn);
- if (read_db(db) < 0) {
- free(db->fn);
- free(db);
- return NULL;
- }
- return db;
-}
-
-static int read_db(struct database *db)
-{
- char *line = NULL;
- size_t linesz = 0;
- struct group *group = NULL, **pgroup = &db->groups;
- int linenr = 0;
-
- while (getline(&line, &linesz, db->fh) > 0) {
- char *s;
- s = strchr(line, '#');
- if (s) {
- struct group *cmt;
- DB_NEW(cmt);
- *pgroup = cmt;
- pgroup = &cmt->next;
- cmt->comment = xstrdup(s + 1);
- *s = 0;
- }
- s = cleanline(line);
- linenr++;
- if (!s)
- continue;
- if (*s == '[') {
- int n;
- char *name;
- ++s;
- n = strcspn(s, "]");
- if (s[n] == 0)
- goto parse_error;
- name = xalloc(n + 1);
- memcpy(name, s, n);
- group = alloc_group(name);
- *pgroup = group;
- pgroup = &group->next;
- } else {
- char *p;
- if (!group)
- goto parse_error;
- p = s + strcspn(s, ":");
- if (*p != ':')
- goto parse_error;
- *p++ = 0;
- if (*p == ' ')
- p++;
- else
- goto parse_error;
- change_entry(db, group, line, p);
- }
- }
-
- if (ferror(db->fh)) {
- DBerror("IO error while reading database %s: %s\n", db->fn,
- strerror(errno));
- goto error;
- }
-
- free(line);
- return 0;
-
-parse_error:
- DBerror("Parse error in database %s at line %d\n", db->fn, linenr);
-error:
- free(line);
- return -1;
-}
-
-/*
-Crash safety strategy:
-
-While the database is opened hold a exclusive flock on the file
-When writing write to a temporary file (.out). Only when the file
-is written rename to another temporary file (.complete).
-
-Then sync and swap tmp file with main file, then sync directory
-(later is linux specific)
-
-During open if the main file doesn't exist and a .complete file does
-rename the .complete file to main first; or open the .complete
-file if the file system is read only.
-
-*/
-
-/* Flush directory. Useful on ext2, on journaling file systems
- the later fsync would usually force earlier transactions on the
- metadata too. */
-static int flush_dir(char *fn)
-{
- int err, fd;
- char *p;
- char dir[strlen(fn) + 1];
- strcpy(dir, fn);
- p = strrchr(dir, '/');
- if (p)
- *p = 0;
- else
- strcpy(dir, ".");
- fd = open(dir, O_DIRECTORY|O_RDONLY);
- if (fd < 0)
- return -1;
- err = 0;
- if (fsync(fd) < 0)
- err = -1;
- if (close(fd) < 0)
- err = -1;
- return err;
-}
-
-static int force_rename(char *a, char *b)
-{
- unlink(b); /* ignore error */
- return rename(a, b);
-}
-
-static int rewrite_db(struct database *db)
-{
- FILE *fhtmp;
- int err;
-
- int tmplen = strlen(db->fn) + 10;
- char fn_complete[tmplen], fn_old[tmplen], fn_out[tmplen];
-
- sprintf(fn_complete, "%s.complete", db->fn);
- sprintf(fn_old, "%s~", db->fn);
- sprintf(fn_out, "%s.out", db->fn);
-
- fhtmp = fopen(fn_out, "w");
- if (!fhtmp) {
- DBerror("Cannot open `%s' output file: %s\n", fn_out,
- strerror(errno));
- return -1;
- }
-
- dump_database(db, fhtmp);
-
- err = 0;
- /* Finish the output file */
- if (ferror(fhtmp) || fflush(fhtmp) != 0 || fsync(fileno(fhtmp)) != 0 ||
- fclose(fhtmp))
- err = -1;
- /* Rename to .complete */
- else if (force_rename(fn_out, fn_complete))
- err = -1;
- /* RED-PEN: need to do retry for race */
- /* Move to final name */
- else if (force_rename(db->fn, fn_old) || rename(fn_complete, db->fn))
- err = -1;
- /* Hit disk */
- else if (flush_dir(db->fn))
- err = -1;
-
- if (err) {
- DBerror("Error writing to database %s: %s\n", db->fn,
- strerror(errno));
- }
-
- return err;
-}
-
-int sync_db(struct database *db)
-{
- if (!db->dirty)
- return 0;
- /* RED-PEN window without lock */
- if (rewrite_db(db))
- return -1;
- fclose(db->fh);
- db->dirty = 0;
- db->fh = open_file(db->fn, 1);
- if (!db->fh)
- return -1;
- return 0;
-}
-
-static void free_group(struct group *g)
-{
- free(g->entries);
- free(g->name);
- free(g->comment);
- free(g);
-}
-
-static void free_data(struct database *db)
-{
- struct group *g, *gnext;
- for (g = db->groups; g; g = gnext) {
- gnext = g->next;
- free_group(g);
- }
-}
-
-int close_db(struct database *db)
-{
- if (db->dirty && rewrite_db(db))
- return -1;
- if (fclose(db->fh))
- return -1;
- free_data(db);
- free(db->fn);
- free(db);
- return 0;
-}
-
-static FILE *open_file(char *fn, int wr)
-{
- char tmp[strlen(fn) + 10];
- FILE *fh;
- if (access(fn, wr ? (R_OK|W_OK) : R_OK)) {
- switch (errno) {
- case EROFS:
- wr = 0;
- break;
- case ENOENT:
- /* No main DB file */
- sprintf(tmp, "%s.complete", fn);
- /* Handle race */
- if (!access(tmp, R_OK)) {
- if (rename(tmp, fn) < 0 && errno == EEXIST)
- return open_file(fn, wr);
- } else
- creat(fn, 0644);
- break;
- }
- }
- fh = fopen(fn, wr ? "r+" : "r");
- if (fh) {
- if (flock(fileno(fh), wr ? LOCK_EX : LOCK_SH) < 0) {
- fclose(fh);
- return NULL;
- }
- }
- return fh;
-}
-
-void dump_group(struct group *g, FILE *out)
-{
- struct entry *e;
- fprintf(out, "[%s]\n", g->name);
- for (e = &g->entries[0]; e->name && !ferror(out); e++)
- fprintf(out, "%s: %s\n", e->name, e->val);
-}
-
-void dump_database(struct database *db, FILE *out)
-{
- struct group *g;
- for (g = db->groups; g && !ferror(out); g = g->next) {
- if (g->comment) {
- fprintf(out, "#%s", g->comment);
- continue;
- }
- dump_group(g, out);
- }
-}
-
-struct group *find_group(struct database *db, char *name)
-{
- struct group *g;
- for (g = db->groups; g; g = g->next)
- if (g->name && !strcmp(g->name, name))
- return g;
- return NULL;
-}
-
-int delete_group(struct database *db, struct group *group)
-{
- struct group *g, **gprev;
- gprev = &db->groups;
- for (g = *gprev; g; gprev = &g->next, g = g->next) {
- if (g == group) {
- *gprev = g->next;
- free_group(g);
- return 0;
- }
- }
- db->dirty = 1;
- return -1;
-}
-
-char *entry_val(struct group *g, char *entry)
-{
- struct entry *e;
- for (e = &g->entries[0]; e->name; e++)
- if (!strcmp(e->name, entry))
- return e->val;
- return NULL;
-}
-
-struct group *add_group(struct database *db, char *name, int *existed)
-{
- struct group *g, **gprev = &db->groups;
- for (g = *gprev; g; gprev = &g->next, g = g->next)
- if (g->name && !strcmp(g->name, name))
- break;
- if (existed)
- *existed = (g != NULL);
- if (!g) {
- g = alloc_group(xstrdup(name));
- g->next = *gprev;
- *gprev = g;
- }
- db->dirty = 1;
- return g;
-
-}
-
-void change_entry(struct database *db, struct group *g,
- char *entry, char *newval)
-{
- int i;
- struct entry *e, *entries;
- db->dirty = 1;
- entries = &g->entries[0];
- for (e = entries; e->name; e++) {
- if (!strcmp(e->name, entry)) {
- free(e->val);
- e->val = xstrdup(newval);
- return;
- }
- }
- i = e - entries;
- assert(i == g->numentries);
- if (i > 0 && (i % ENTRY_CHUNK) == 0) {
- int new = (i + ENTRY_CHUNK) * sizeof(struct entry);
- g->entries = xrealloc(g->entries, new);
- }
- entries = &g->entries[0];
- e = &entries[i];
- e->name = xstrdup(entry);
- e->val = xstrdup(newval);
- g->numentries++;
-}
-
-void delete_entry(struct database *db, struct group *g, char *entry)
-{
- struct entry *e;
- for (e = &g->entries[0]; e->name; e++)
- if (!strcmp(e->name, entry))
- break;
- if (e->name == NULL)
- return;
- while ((++e)->name)
- e[-1] = e[0];
- g->numentries--;
-}
-
-struct group *
-clone_group(struct database *db, struct group *gold, char *newname)
-{
- struct entry *e;
- struct group *gnew = add_group(db, newname, NULL);
- for (e = &gold->entries[0]; e->name; e++)
- change_entry(db, gnew, e->name, e->val);
- return gnew;
-}
-
-static char *save_comment(char *c)
-{
- int len = strlen(c);
- char *s = xalloc(len + 2);
- strcpy(s, c);
- if (len == 0 || c[len - 1] != '\n')
- s[len] = '\n';
- return s;
-}
-
-void add_comment(struct database *db, struct group *group, char *comment)
-{
- struct group *g;
- struct group **gprev = &db->groups;
- for (g = *gprev; g; gprev = &g->next, g = g->next) {
- if ((group && g == group) || (!group && g->comment == NULL))
- break;
- }
- DB_NEW(g);
- g->comment = save_comment(comment);
- g->next = *gprev;
- *gprev = g;
- db->dirty = 1;
-}
-
-struct group *first_group(struct database *db)
-{
- return next_group(db->groups);
-}
-
-struct group *next_group(struct group *g)
-{
- struct group *n;
- if (!g)
- return NULL;
- n = g->next;
- while (n && n->comment)
- n = n->next;
- return n;
-}
-
-char *group_name(struct group *g)
-{
- return g->name;
-}
-
-struct group *find_entry(struct database *db, struct group *prev,
- char *entry, char *value)
-{
- int previ = 0;
- struct entry *e;
- struct group *g;
- if (prev)
- g = prev->next;
- else
- g = db->groups;
- for (; g; g = g->next) {
- if (g->comment)
- continue;
- /* Short cut when entry is at the same place as previous */
- if (previ < g->numentries) {
- e = &g->entries[previ];
- if (!strcmp(e->name, entry)) {
- if (!strcmp(e->val, value))
- return g;
- continue;
- }
- }
- for (e = &g->entries[0]; e->name; e++) {
- if (strcmp(e->name, entry))
- continue;
- if (!strcmp(e->val, value))
- return g;
- previ = e - &g->entries[0];
- break;
- }
- }
- return NULL;
-}
-
-void rename_group(struct database *db, struct group *g, char *newname)
-{
- free(g->name);
- g->name = xstrdup(newname);
- db->dirty = 1;
-}
-
-unsigned long entry_num(struct group *g, char *entry)
-{
- char *e = entry_val(g, entry);
- unsigned long val = 0;
- if (e)
- sscanf(e, "%lu", &val);
- return val;
-}
-
-void change_entry_num(struct database *db, struct group *g,
- char *entry, unsigned long val)
-{
- char buf[20];
- sprintf(buf, "%lu", val);
- change_entry(db, g, entry, buf);
-}
diff -urNp mcelog-d2e13bf0.orig/db.h mcelog-d2e13bf0/db.h
--- mcelog-d2e13bf0.orig/db.h 2016-11-30 11:23:54.531909194 -0500
+++ mcelog-d2e13bf0/db.h 1969-12-31 19:00:00.000000000 -0500
@@ -1,29 +0,0 @@
-#include <stdio.h>
-struct database;
-struct group;
-
-struct database *open_db(char *fn, int wr);
-int sync_db(struct database *db);
-int close_db(struct database *db);
-struct group *find_group(struct database *db, char *name);
-char *entry_val(struct group *g, char *entry);
-struct group *add_group(struct database *db, char *name, int *existed);
-int delete_group(struct database *db, struct group *g);
-void change_entry(struct database *db, struct group *g,
- char *entry, char *newval);
-void add_comment(struct database *db, struct group *group, char *comment);
-struct group *first_group(struct database *db);
-struct group *next_group(struct group *g);
-void dump_group(struct group *g, FILE *out);
-void dump_database(struct database *db, FILE *out);
-struct group *find_entry(struct database *db, struct group *prev,
- char *entry, char *value);
-void rename_group(struct database *db, struct group *group, char *newname);
-char *group_name(struct group *g);
-unsigned long entry_num(struct group *g, char *entry);
-void change_entry_num(struct database *db, struct group *g, char *entry,
- unsigned long val);
-void delete_entry(struct database *db, struct group *g, char *entry);
-struct group *
-clone_group(struct database *db, struct group *gold, char *newname);
-
diff -urNp mcelog-d2e13bf0.orig/dbquery.c mcelog-d2e13bf0/dbquery.c
--- mcelog-d2e13bf0.orig/dbquery.c 2016-11-30 11:23:54.531909194 -0500
+++ mcelog-d2e13bf0/dbquery.c 1969-12-31 19:00:00.000000000 -0500
@@ -1,130 +0,0 @@
-/* Access db files. This is for testing and debugging only. */
-#define _GNU_SOURCE 1
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-#include <stdlib.h>
-#include <errno.h>
-#include <stdarg.h>
-#include "db.h"
-
-#define C(x) if (x) printf(#x " failed: %s\n", strerror(errno))
-#define NEEDGROUP if (!group) { printf("need group first\n"); break; }
-
-void Eprintf(char *fmt, ...)
-{
- va_list ap;
- va_start(ap, fmt);
- vfprintf(stderr, fmt, ap);
- va_end(ap);
-}
-
-void usage(void)
-{
- printf(
- "s sync\n"
- "q close/quit\n"
- "ggroup find group\n"
- "G delete group\n"
- "agroup add group\n"
- "ventry dump entry\n"
- "centry,val change entry to val\n"
- "fentry,val find entry with value and dump its group\n"
- "Ccomment add comment\n"
- "Lnewname clone group to newname\n"
- "d dump group\n"
- "D dump database\n");
-}
-
-int main(int ac, char **av)
-{
- struct database *db;
- struct group *group = NULL;
- char *line = NULL;
- size_t linesz = 0;
- if (!av[1]) {
- printf("%s database\n", av[0]);
- exit(1);
- }
- printf("dbtest\n");
- db = open_db(av[1], 1);
- while (printf("> "),
- fflush(stdout),
- getline(&line, &linesz, stdin) > 0) {
- char *p = line + strlen(line) - 1;
- while (p >= line && isspace(*p))
- *p-- = 0;
- switch (line[0]) {
- case 's':
- C(sync_db(db));
- break;
- case 'q':
- C(close_db(db));
- exit(0);
- case 'g':
- group = find_group(db, line + 1);
- if (group)
- printf("found\n");
- break;
- case 'G':
- NEEDGROUP;
- C(delete_group(db, group));
- group = NULL;
- break;
- case 'a': {
- int existed = 0;
- group = add_group(db, line + 1, &existed);
- if (existed)
- printf("existed\n");
- break;
- }
- case 'v':
- NEEDGROUP;
- printf("%s\n", entry_val(group, line + 1));
- break;
- case 'c': {
- p = line + 1;
- char *entry = strsep(&p, ",");
- NEEDGROUP;
- change_entry(db, group, entry, strsep(&p, ""));
- break;
- }
- case 'L':
- NEEDGROUP;
- clone_group(db, group, line + 1);
- break;
- case 'f': {
- struct group *g;
- p = line + 1;
- char *entry = strsep(&p, ",");
- char *val = strsep(&p, "");
- g = NULL;
- int nr = 0;
- while ((g = find_entry(db, g, entry, val)) != NULL) {
- if (nr == 0)
- group = g;
- nr++;
- dump_group(group, stdout);
- }
- if (nr == 0)
- printf("not found\n");
- break;
- }
- case 'C':
- NEEDGROUP;
- add_comment(db, group, line + 1);
- break;
- case 'd':
- NEEDGROUP;
- dump_group(group, stdout);
- break;
- case 'D':
- dump_database(db, stdout);
- break;
- default:
- usage();
- break;
- }
- }
- return 0;
-}
diff -urNp mcelog-d2e13bf0.orig/denverton.c mcelog-d2e13bf0/denverton.c
--- mcelog-d2e13bf0.orig/denverton.c 1969-12-31 19:00:00.000000000 -0500
+++ mcelog-d2e13bf0/denverton.c 2016-11-30 11:24:12.204619369 -0500
@@ -0,0 +1,45 @@
+/* Copyright (C) 2016 Intel Corporation
+ Decode Intel Denverton specific machine check errors.
+
+ mcelog is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; version
+ 2.
+
+ mcelog is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should find a copy of v2 of the GNU General Public License somewhere
+ on your Linux system; if not, write to the Free Software Foundation,
+ Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ Author: Tony Luck
+*/
+
+#include "mcelog.h"
+#include "bitfield.h"
+#include "denverton.h"
+#include "memdb.h"
+
+/* Descriptions for the status bits decoded below; see IA32 SDM Vol3B Table 16-33 */
+
+static struct field mc_bits[] = {
+ SBITFIELD(16, "Cmd/Addr parity"),
+ SBITFIELD(17, "Corrected Demand/Patrol Scrub Error"),
+ SBITFIELD(18, "Uncorrected patrol scrub error"),
+ SBITFIELD(19, "Uncorrected demand read error"),
+ SBITFIELD(20, "WDB read ECC"),
+ {} /* empty entry; presumably the end-of-table marker for decode_bitfield() -- confirm in bitfield.h */
+};
+
+void denverton_decode_model(int cputype, int bank, u64 status, u64 misc)
+{ /* cputype and misc are accepted but not used in this function */
+ switch (bank) { /* banks other than 6 and 7 produce no output here */
+ case 6: case 7:
+ Wprintf("MemCtrl: ");
+ decode_bitfield(status, mc_bits); /* status is decoded against the mc_bits table */
+ break;
+ }
+}
diff -urNp mcelog-d2e13bf0.orig/denverton.h mcelog-d2e13bf0/denverton.h
--- mcelog-d2e13bf0.orig/denverton.h 1969-12-31 19:00:00.000000000 -0500
+++ mcelog-d2e13bf0/denverton.h 2016-11-30 11:24:12.204619369 -0500
@@ -0,0 +1 @@
+void denverton_decode_model(int cputype, int bank, u64 status, u64 misc);
diff -urNp mcelog-d2e13bf0.orig/diskdb.c mcelog-d2e13bf0/diskdb.c
--- mcelog-d2e13bf0.orig/diskdb.c 2016-11-30 11:23:54.531909194 -0500
+++ mcelog-d2e13bf0/diskdb.c 1969-12-31 19:00:00.000000000 -0500
@@ -1,96 +0,0 @@
-/* High level interface to disk based DIMM database */
-/* Note: obsolete: new design is in memdb.c */
-#include <stdlib.h>
-#include <getopt.h>
-#include <stdio.h>
-#include "mcelog.h"
-#include "diskdb.h"
-#include "paths.h"
-#include "dimm.h"
-#include "dmi.h"
-
-char *error_trigger;
-unsigned error_thresh = 20;
-char *dimm_db_fn = DIMM_DB_FILENAME;
-
-static void checkdimmdb(void)
-{
- if (open_dimm_db(dimm_db_fn) < 0)
- exit(1);
-}
-
-int diskdb_modifier(int opt)
-{
- char *end;
-
- switch (opt) {
- case O_DATABASE:
- dimm_db_fn = optarg;
- checkdmi();
- checkdimmdb();
- break;
- case O_ERROR_TRIGGER:
- checkdmi();
- open_dimm_db(dimm_db_fn);
- error_thresh = strtoul(optarg, &end, 0);
- if (end == optarg || *end != ',')
- usage();
- error_trigger = end + 1;
- break;
- default:
- return 0;
- }
- return 1;
-}
-
-void diskdb_resolve_addr(u64 addr)
-{
- if (open_dimm_db(dimm_db_fn) >= 0)
- new_error(addr, error_thresh, error_trigger);
-}
-
-
-void diskdb_usage(void)
-{
- fprintf(stderr,
- "Manage disk DIMM error database\n"
- " mcelog [options] --drop-old-memory|--reset-memory locator\n"
- " mcelog --dump-memory locator\n"
- " old can be either locator or name\n"
- "Disk database options:"
- "--database fn Set filename of DIMM database (default " DIMM_DB_FILENAME ")\n"
- "--error-trigger cmd,thresh Run cmd on exceeding thresh errors per DIMM\n");
-}
-
-
-static void dimm_common(int ac, char **av)
-{
- no_syslog();
- checkdmi();
- checkdimmdb();
- argsleft(ac, av);
-}
-
-int diskdb_cmd(int opt, int ac, char **av)
-{
- char *arg = optarg;
-
- switch (opt) {
- case O_DUMP_MEMORY:
- dimm_common(ac, av);
- if (arg)
- dump_dimm(arg);
- else
- dump_all_dimms();
- return 1;
- case O_RESET_MEMORY:
- dimm_common(ac, av);
- reset_dimm(arg);
- return 1;
- case O_DROP_OLD_MEMORY:
- dimm_common(ac, av);
- gc_dimms();
- return 1;
- }
- return 0;
-}
diff -urNp mcelog-d2e13bf0.orig/diskdb.h mcelog-d2e13bf0/diskdb.h
--- mcelog-d2e13bf0.orig/diskdb.h 2016-11-30 11:23:54.531909194 -0500
+++ mcelog-d2e13bf0/diskdb.h 1969-12-31 19:00:00.000000000 -0500
@@ -1,32 +0,0 @@
-
-#ifdef CONFIG_DISKDB
-enum diskdb_options {
- O_DATABASE = O_DISKDB,
- O_ERROR_TRIGGER,
- O_DUMP_MEMORY,
- O_RESET_MEMORY,
- O_DROP_OLD_MEMORY,
-};
-
-void diskdb_resolve_addr(u64 addr);
-int diskdb_modifier(int opt);
-int diskdb_cmd(int opt, int ac, char **av);
-void diskdb_usage(void);
-
-#define DISKDB_OPTIONS \
- { "database", 1, NULL, O_DATABASE }, \
- { "error-trigger", 1, NULL, O_ERROR_TRIGGER }, \
- { "dump-memory", 2, NULL, O_DUMP_MEMORY }, \
- { "reset-memory", 2, NULL, O_RESET_MEMORY }, \
- { "drop-old-memory", 0, NULL, O_DROP_OLD_MEMORY },
-
-#else
-
-static inline void diskdb_resolve_addr(u64 addr) {}
-static inline int diskdb_modifier(int opt) { return 0; }
-static inline int diskdb_cmd(int opt, int ac, char **av) { return 0; }
-static inline void diskdb_usage(void) {}
-
-#define DISKDB_OPTIONS
-
-#endif
diff -urNp mcelog-d2e13bf0.orig/dmi.h mcelog-d2e13bf0/dmi.h
--- mcelog-d2e13bf0.orig/dmi.h 2016-11-30 11:23:54.534909314 -0500
+++ mcelog-d2e13bf0/dmi.h 2016-11-30 11:24:12.205619409 -0500
@@ -3,7 +3,7 @@ struct dmi_entry {
unsigned char type;
unsigned char length;
unsigned short handle;
-};
+} __attribute__((packed));
enum {
DMI_MEMORY_ARRAY = 16,
diff -urNp mcelog-d2e13bf0.orig/.gitignore mcelog-d2e13bf0/.gitignore
--- mcelog-d2e13bf0.orig/.gitignore 2016-11-30 11:23:54.530909154 -0500
+++ mcelog-d2e13bf0/.gitignore 2016-11-30 11:24:12.202619289 -0500
@@ -8,3 +8,5 @@ dbquery
.depend
tsc
core
+version.c
+version.tmp
diff -urNp mcelog-d2e13bf0.orig/input/bdw_mirror1 mcelog-d2e13bf0/input/bdw_mirror1
--- mcelog-d2e13bf0.orig/input/bdw_mirror1 1969-12-31 19:00:00.000000000 -0500
+++ mcelog-d2e13bf0/input/bdw_mirror1 2016-11-30 11:24:12.205619409 -0500
@@ -0,0 +1,6 @@
+# Broadwell mirror corrected with mirror failover
+CPU 0 7
+PROCESSOR 0:0x406f0
+STATUS 0x8800000000000080
+MISC 20000000000
+
diff -urNp mcelog-d2e13bf0.orig/input/bdw_mirror2 mcelog-d2e13bf0/input/bdw_mirror2
--- mcelog-d2e13bf0.orig/input/bdw_mirror2 1969-12-31 19:00:00.000000000 -0500
+++ mcelog-d2e13bf0/input/bdw_mirror2 2016-11-30 11:24:12.205619409 -0500
@@ -0,0 +1,6 @@
+# Broadwell mirror corrected with successful scrub
+CPU 0 7
+PROCESSOR 0:0x406f0
+STATUS 0x8800000000000080
+MISC 40000000000
+
diff -urNp mcelog-d2e13bf0.orig/input/GENMEM mcelog-d2e13bf0/input/GENMEM
--- mcelog-d2e13bf0.orig/input/GENMEM 2016-11-30 11:23:54.532909234 -0500
+++ mcelog-d2e13bf0/input/GENMEM 2016-11-30 11:24:12.205619409 -0500
@@ -11,7 +11,7 @@ dimm=${3:-0}
corr_err_cnt=${4:-0}
if [ ! -z "$5" ] ; then
- ucflag=$[1 << (61-32)]
+ ucflag=$[(1 << (61-32)) | (1 << (60-32)) | (1 << (56-32))]
else
ucflag=0
fi
diff -urNp mcelog-d2e13bf0.orig/input/skx_mirror1 mcelog-d2e13bf0/input/skx_mirror1
--- mcelog-d2e13bf0.orig/input/skx_mirror1 1969-12-31 19:00:00.000000000 -0500
+++ mcelog-d2e13bf0/input/skx_mirror1 2016-11-30 11:24:12.205619409 -0500
@@ -0,0 +1,6 @@
+# Skylake mirror corrected with mirror failover
+CPU 0 7
+PROCESSOR 0:0x50650
+STATUS 0x8800000000000080
+MISC 8000000000000000
+
diff -urNp mcelog-d2e13bf0.orig/input/skx_mirror2 mcelog-d2e13bf0/input/skx_mirror2
--- mcelog-d2e13bf0.orig/input/skx_mirror2 1969-12-31 19:00:00.000000000 -0500
+++ mcelog-d2e13bf0/input/skx_mirror2 2016-11-30 11:24:12.205619409 -0500
@@ -0,0 +1,6 @@
+# Skylake mirror corrected with successful scrub
+CPU 0 7
+PROCESSOR 0:0x50650
+STATUS 0x8800000000000080
+MISC 4000000000000000
+
diff -urNp mcelog-d2e13bf0.orig/intel.c mcelog-d2e13bf0/intel.c
--- mcelog-d2e13bf0.orig/intel.c 2016-11-30 11:23:54.538909475 -0500
+++ mcelog-d2e13bf0/intel.c 2016-11-30 11:24:12.206619450 -0500
@@ -25,7 +25,6 @@
#include "sandy-bridge.h"
#include "ivy-bridge.h"
#include "haswell.h"
-#include "xeon75xx.h"
int memory_error_support;
@@ -36,7 +35,9 @@ void intel_cpu_init(enum cputype cpu)
cpu == CPU_IVY_BRIDGE || cpu == CPU_IVY_BRIDGE_EPEX ||
cpu == CPU_HASWELL || cpu == CPU_HASWELL_EPEX || cpu == CPU_BROADWELL ||
cpu == CPU_BROADWELL_DE || cpu == CPU_BROADWELL_EPEX ||
- cpu == CPU_KNIGHTS_LANDING || cpu == CPU_SKYLAKE || cpu == CPU_SKYLAKE_XEON)
+ cpu == CPU_KNIGHTS_LANDING || cpu == CPU_KNIGHTS_MILL ||
+ cpu == CPU_SKYLAKE || cpu == CPU_SKYLAKE_XEON ||
+ cpu == CPU_KABYLAKE || cpu == CPU_DENVERTON)
memory_error_support = 1;
}
@@ -82,6 +83,8 @@ enum cputype select_intel_cputype(int fa
return CPU_BROADWELL_DE;
else if (model == 0x57)
return CPU_KNIGHTS_LANDING;
+ else if (model == 0x85)
+ return CPU_KNIGHTS_MILL;
else if (model == 0x1c || model == 0x26 || model == 0x27 ||
model == 0x35 || model == 0x36 || model == 0x36 ||
model == 0x37 || model == 0x4a || model == 0x4c ||
@@ -91,18 +94,22 @@ enum cputype select_intel_cputype(int fa
return CPU_SKYLAKE;
else if (model == 0x55)
return CPU_SKYLAKE_XEON;
+ else if (model == 0x8E || model == 0x9E)
+ return CPU_KABYLAKE;
+ else if (model == 0x5f)
+ return CPU_DENVERTON;
if (model > 0x1a) {
- Eprintf("Family 6 Model %x CPU: only decoding architectural errors\n",
+ Eprintf("Family 6 Model %u CPU: only decoding architectural errors\n",
model);
return CPU_INTEL;
}
}
if (family > 6) {
- Eprintf("Family %u Model %x CPU: only decoding architectural errors\n",
+ Eprintf("Family %u Model %u CPU: only decoding architectural errors\n",
family, model);
return CPU_INTEL;
}
- Eprintf("Unknown Intel CPU type family %x model %x\n", family, model);
+ Eprintf("Unknown Intel CPU type family %u model %u\n", family, model);
return family == 6 ? CPU_P6OLD : CPU_GENERIC;
}
@@ -127,9 +134,6 @@ static int intel_memory_error(struct mce
case CPU_NEHALEM:
nehalem_memerr_misc(m, channel, dimm);
break;
- case CPU_XEON75XX:
- xeon75xx_memory_error(m, recordlen, channel, dimm);
- break;
case CPU_SANDY_BRIDGE_EP:
sandy_bridge_ep_memerr_misc(m, channel, dimm);
break;
diff -urNp mcelog-d2e13bf0.orig/intel.h mcelog-d2e13bf0/intel.h
--- mcelog-d2e13bf0.orig/intel.h 2016-11-30 11:23:54.530909154 -0500
+++ mcelog-d2e13bf0/intel.h 2016-11-30 11:24:12.206619450 -0500
@@ -25,6 +25,9 @@ extern int memory_error_support;
case CPU_BROADWELL_EPEX: \
case CPU_ATOM: \
case CPU_KNIGHTS_LANDING: \
+ case CPU_KNIGHTS_MILL: \
case CPU_SKYLAKE: \
- case CPU_SKYLAKE_XEON
+ case CPU_SKYLAKE_XEON: \
+ case CPU_KABYLAKE: \
+ case CPU_DENVERTON
diff -urNp mcelog-d2e13bf0.orig/leaky-bucket.c mcelog-d2e13bf0/leaky-bucket.c
--- mcelog-d2e13bf0.orig/leaky-bucket.c 2016-11-30 11:23:54.537909435 -0500
+++ mcelog-d2e13bf0/leaky-bucket.c 2016-11-30 11:24:12.206619450 -0500
@@ -72,7 +72,9 @@ static int timeconv(char unit, int *out)
case 'h': corr *= 60;
case 'm': corr *= 60;
case 0: break;
- default: return -1;
+ default:
+ *out = 1;
+ return -1;
}
*out = corr;
return 0;
diff -urNp mcelog-d2e13bf0.orig/Makefile mcelog-d2e13bf0/Makefile
--- mcelog-d2e13bf0.orig/Makefile 2016-11-30 11:23:54.538909475 -0500
+++ mcelog-d2e13bf0/Makefile 2016-11-30 11:24:12.202619289 -0500
@@ -17,11 +17,6 @@ WARNINGS := -Wall -Wextra -Wno-missing-f
-Wstrict-prototypes -Wformat-security -Wmissing-declarations \
-Wdeclaration-after-statement
-# The on disk database has still many problems (partly in this code and partly
-# due to missing support from BIOS), so it's disabled by default. You can
-# enable it here by uncommenting the following line
-# CONFIG_DISKDB = 1
-
TRIGGERS=cache-error-trigger dimm-error-trigger page-error-trigger \
socket-memory-error-trigger \
bus-error-trigger \
@@ -36,23 +31,16 @@ OBJ := p4.o k8.o mcelog.o dmi.o tsc.o co
nehalem.o dunnington.o tulsa.o config.o memutil.o msg.o \
eventloop.o leaky-bucket.o memdb.o server.o trigger.o \
client.o cache.o sysfs.o yellow.o page.o rbtree.o \
- xeon75xx.o sandy-bridge.o ivy-bridge.o haswell.o \
+ sandy-bridge.o ivy-bridge.o haswell.o \
broadwell_de.o broadwell_epex.o skylake_xeon.o \
+ denverton.o \
msr.o bus.o unknown.o
-DISKDB_OBJ := diskdb.o dimm.o db.o
-CLEAN := mcelog dmi tsc dbquery .depend .depend.X dbquery.o ${DISKDB_OBJ} \
+CLEAN := mcelog dmi tsc dbquery .depend .depend.X dbquery.o \
version.o version.c version.tmp
DOC := mce.pdf
ADD_DEFINES :=
-ifdef CONFIG_DISKDB
-ADD_DEFINES := -DCONFIG_DISKDB=1
-OBJ += ${DISKDB_OBJ}
-
-all: dbquery
-endif
-
SRC := $(OBJ:.o=.c)
mcelog: ${OBJ} version.o
diff -urNp mcelog-d2e13bf0.orig/mcelog.c mcelog-d2e13bf0/mcelog.c
--- mcelog-d2e13bf0.orig/mcelog.c 2016-11-30 11:23:54.531909194 -0500
+++ mcelog-d2e13bf0/mcelog.c 2016-11-30 11:25:24.563516902 -0500
@@ -48,7 +48,6 @@
#include "tsc.h"
#include "version.h"
#include "config.h"
-#include "diskdb.h"
#include "memutil.h"
#include "eventloop.h"
#include "memdb.h"
@@ -236,9 +235,12 @@ static char *cputype_name[] = {
[CPU_BROADWELL_DE] = "Intel Xeon (Broadwell) D family",
[CPU_BROADWELL_EPEX] = "Intel Xeon v4 (Broadwell) EP/EX",
[CPU_KNIGHTS_LANDING] = "Knights Landing",
+ [CPU_KNIGHTS_MILL] = "Knights Mill",
[CPU_ATOM] = "ATOM",
[CPU_SKYLAKE] = "Skylake",
[CPU_SKYLAKE_XEON] = "Skylake server",
+ [CPU_KABYLAKE] = "Kabylake",
+ [CPU_DENVERTON] = "Denverton",
};
static struct config_choice cpu_choices[] = {
@@ -282,10 +284,13 @@ static struct config_choice cpu_choices[
{ "broadwell-ep", CPU_BROADWELL_EPEX },
{ "broadwell-ex", CPU_BROADWELL_EPEX },
{ "knightslanding", CPU_KNIGHTS_LANDING },
+ { "knightsmill", CPU_KNIGHTS_MILL },
{ "xeon-v4", CPU_BROADWELL_EPEX },
{ "atom", CPU_ATOM },
{ "skylake", CPU_SKYLAKE },
{ "skylake_server", CPU_SKYLAKE_XEON },
+ { "kabylake", CPU_KABYLAKE },
+ { "denverton", CPU_DENVERTON },
{ NULL }
};
@@ -356,7 +361,7 @@ static enum cputype setup_cpuid(u32 cpuv
return CPU_K8;
/* FALL THROUGH */
default:
- Eprintf("Unknown CPU type vendor %u family %x model %x",
+ Eprintf("Unknown CPU type vendor %u family %u model %u",
cpuvendor, family, model);
return CPU_GENERIC;
}
@@ -449,12 +454,10 @@ static void dump_mce(struct mce *m, unsi
if (cputype != CPU_SANDY_BRIDGE_EP && cputype != CPU_IVY_BRIDGE_EPEX &&
cputype != CPU_HASWELL_EPEX && cputype != CPU_BROADWELL &&
cputype != CPU_BROADWELL_DE && cputype != CPU_BROADWELL_EPEX &&
- cputype != CPU_KNIGHTS_LANDING && cputype != CPU_SKYLAKE &&
- cputype != CPU_SKYLAKE_XEON)
+ cputype != CPU_KNIGHTS_LANDING && cputype != CPU_KNIGHTS_MILL &&
+ cputype != CPU_SKYLAKE && cputype != CPU_SKYLAKE_XEON &&
+ cputype != CPU_KABYLAKE && cputype != CPU_DENVERTON)
resolveaddr(m->addr);
- if (!ascii_mode && ismemerr && (m->status & MCI_STATUS_ADDRV)) {
- diskdb_resolve_addr(m->addr);
- }
}
static void dump_mce_raw_ascii(struct mce *m, unsigned recordlen)
@@ -889,6 +892,7 @@ static void remove_pidfile(void)
static void signal_exit(int sig)
{
remove_pidfile();
+ client_cleanup();
_exit(sig);
}
@@ -974,7 +978,6 @@ void usage(void)
"--no-imc-log Disable extended iMC logging\n"
"--is-cpu-supported Exit with return code indicating whether the CPU is supported\n"
);
- diskdb_usage();
printf("\n");
print_cputypes();
exit(1);
@@ -1043,7 +1046,6 @@ static struct option options[] = {
{ "debug-numerrors", 0, NULL, O_DEBUG_NUMERRORS }, /* undocumented: for testing */
{ "no-imc-log", 0, NULL, O_NO_IMC_LOG },
{ "is-cpu-supported", 0, NULL, O_IS_CPU_SUPPORTED },
- DISKDB_OPTIONS
{}
};
@@ -1191,8 +1193,6 @@ void no_syslog(void)
static int combined_modifier(int opt)
{
int r = modifier(opt);
- if (r == 0)
- r = diskdb_modifier(opt);
return r;
}
@@ -1369,8 +1369,6 @@ int main(int ac, char **av)
noargs(ac, av);
fprintf(stderr, "mcelog %s\n", MCELOG_VERSION);
exit(0);
- } else if (diskdb_cmd(opt, ac, av)) {
- exit(0);
} else if (opt == 0)
break;
}
diff -urNp mcelog-d2e13bf0.orig/mcelog.h mcelog-d2e13bf0/mcelog.h
--- mcelog-d2e13bf0.orig/mcelog.h 2016-11-30 11:23:54.539909515 -0500
+++ mcelog-d2e13bf0/mcelog.h 2016-11-30 11:24:12.207619490 -0500
@@ -127,9 +127,12 @@ enum cputype {
CPU_BROADWELL_DE,
CPU_BROADWELL_EPEX,
CPU_KNIGHTS_LANDING,
+ CPU_KNIGHTS_MILL,
CPU_ATOM,
CPU_SKYLAKE,
CPU_SKYLAKE_XEON,
+ CPU_KABYLAKE,
+ CPU_DENVERTON,
};
enum option_ranges {
diff -urNp mcelog-d2e13bf0.orig/mcelog.service mcelog-d2e13bf0/mcelog.service
--- mcelog-d2e13bf0.orig/mcelog.service 2016-11-30 11:23:54.540909556 -0500
+++ mcelog-d2e13bf0/mcelog.service 2016-11-30 11:24:12.207619490 -0500
@@ -5,6 +5,7 @@ After=syslog.target
[Service]
ExecStart=/usr/sbin/mcelog --ignorenodev --daemon --foreground
StandardOutput=syslog
+SuccessExitStatus=0 15
[Install]
WantedBy=multi-user.target
diff -urNp mcelog-d2e13bf0.orig/msr.c mcelog-d2e13bf0/msr.c
--- mcelog-d2e13bf0.orig/msr.c 2016-11-30 11:23:54.538909475 -0500
+++ mcelog-d2e13bf0/msr.c 2016-11-30 11:24:12.207619490 -0500
@@ -25,19 +25,20 @@ static void domsr(int cpu, int msr, int
}
if (pread(fd, &data, sizeof data, msr) != sizeof data) {
SYSERRprintf("Cannot read MSR_ERROR_CONTROL from %s\n", fpath);
- return;
+ goto out;
}
data |= bit;
if (pwrite(fd, &data, sizeof data, msr) != sizeof data) {
SYSERRprintf("Cannot write MSR_ERROR_CONTROL to %s\n", fpath);
- return;
+ goto out;
}
if (pread(fd, &data, sizeof data, msr) != sizeof data) {
SYSERRprintf("Cannot re-read MSR_ERROR_CONTROL from %s\n", fpath);
- return;
+ goto out;
}
if ((data & bit) == 0)
Lprintf("No DIMM detection available on cpu %d (normal in virtual environments)\n", cpu);
+out:
close(fd);
}
diff -urNp mcelog-d2e13bf0.orig/nehalem.c mcelog-d2e13bf0/nehalem.c
--- mcelog-d2e13bf0.orig/nehalem.c 2016-11-30 11:23:54.537909435 -0500
+++ mcelog-d2e13bf0/nehalem.c 2016-11-30 11:24:12.207619490 -0500
@@ -24,7 +24,6 @@
#include "nehalem.h"
#include "bitfield.h"
#include "memdb.h"
-#include "xeon75xx.h"
/* See IA32 SDM Vol3B Appendix E.3.2 ff */
@@ -130,7 +129,8 @@ void decode_memory_controller(u32 status
if ((status & 0xf) == 0xf)
strcpy(channel, "unspecified");
else {
- if (cputype == CPU_KNIGHTS_LANDING) /* Fix for Knights Landing MIC */
+ /* Fix for Knights Landing/Mill MIC */
+ if (cputype == CPU_KNIGHTS_LANDING || cputype == CPU_KNIGHTS_MILL)
sprintf(channel, "%u", (status & 0xf) + 3 * (bank == 15));
else
sprintf(channel, "%u", status & 0xf);
@@ -170,7 +170,6 @@ void xeon75xx_decode_model(struct mce *m
decode_bitfield(status, internal_error_status);
decode_numfield(status, internal_error_numbers);
}
- xeon75xx_decode_dimm(m, msize);
}
/* Nehalem-EP specific DIMM decoding */
diff -urNp mcelog-d2e13bf0.orig/p4.c mcelog-d2e13bf0/p4.c
--- mcelog-d2e13bf0.orig/p4.c 2016-11-30 11:23:54.534909314 -0500
+++ mcelog-d2e13bf0/p4.c 2016-11-30 11:24:12.208619530 -0500
@@ -39,6 +39,7 @@
#include "broadwell_de.h"
#include "broadwell_epex.h"
#include "skylake_xeon.h"
+#include "denverton.h"
/* decode mce for P4/Xeon and Core2 family */
@@ -289,10 +290,29 @@ static const char *arstate[4] = {
[3] = "SRAR"
};
+static const char *ce_types[] = { /* text appended to "Corrected error by %s"; indexed by the check_for_mirror() result */
+ [0] = "ecc", /* not printed by decode_mci(): a zero result takes the plain "Corrected error" branch */
+ [1] = "mirroring with channel failover",
+ [2] = "mirroring. Primary channel scrubbed successfully"
+};
+
+static int check_for_mirror(__u8 bank, __u64 status, __u64 misc) /* returns the index into ce_types[]; 0 means no mirroring correction was identified */
+{
+ switch (cputype) { /* cputype is not a parameter; it is read from outside this function */
+ case CPU_BROADWELL_EPEX:
+ return bdw_epex_ce_type(bank, status, misc);
+ case CPU_SKYLAKE_XEON:
+ return skylake_s_ce_type(bank, status, misc);
+ default:
+ return 0; /* every other CPU type: no model-specific mirroring decode is attempted */
+ }
+}
+
static int decode_mci(__u64 status, __u64 misc, int cpu, unsigned mcgcap, int *ismemerr,
int socket, __u8 bank)
{
u64 track = 0;
+ int i;
Wprintf("MCi status:\n");
if (!(status & MCI_STATUS_VAL))
@@ -303,6 +323,8 @@ static int decode_mci(__u64 status, __u6
if (status & MCI_STATUS_UC)
Wprintf("Uncorrected error\n");
+ else if ((i = check_for_mirror(bank, status, misc)))
+ Wprintf("Corrected error by %s\n", ce_types[i]);
else
Wprintf("Corrected error\n");
@@ -428,6 +450,9 @@ void decode_intel_mc(struct mce *log, in
case CPU_SKYLAKE_XEON:
skylake_s_decode_model(cputype, log->bank, log->status, log->misc);
break;
+ case CPU_DENVERTON:
+ denverton_decode_model(cputype, log->bank, log->status, log->misc);
+ break;
}
}
diff -urNp mcelog-d2e13bf0.orig/README mcelog-d2e13bf0/README
--- mcelog-d2e13bf0.orig/README 2016-11-30 11:23:54.538909475 -0500
+++ mcelog-d2e13bf0/README 1969-12-31 19:00:00.000000000 -0500
@@ -1,119 +0,0 @@
-mcelog is the user space backend for logging machine check errors
-reported by the hardware to the kernel. The kernel does the immediate
-actions (like killing processes etc.) and mcelog decodes the errors
-and manages various other advanced error responses like
-offlining memory, CPUs or triggering events. In addition
-mcelog also handles corrected errors, by logging and accounting them.
-
-It primarily handles machine checks and thermal events, which
-are reported for errors detected by the CPU.
-
-For more details on what mcelog can do and the underlying theory
-see http://www.mcelog.org
-
-It is recommended that mcelog runs on all x86 machines, both
-64bit (since early 2.6) and 32bit (since 2.6.32)
-
-mcelog can run in several modi: cronjob, trigger, daemon
-
-cronjob is the old method. mcelog runs every 5 minutes from cron and checks
-for errors. Disadvantage of this is that it can delay error reporting
-significantly (upto 10 minutes) and does not allow mcelog to keep extended state.
-
-trigger is a newer method where the kernel runs mcelog on a error.
-This is configured with
-echo /usr/sbin/mcelog > /sys/devices/system/machinecheck/machinecheck0/trigger
-This is faster, but still doesn't allow mcelog to keep state,
-and has relatively high overhead for each error because a program has
-to be initialized from scratch.
-
-In daemon mode mcelog runs continuously as a daemon in the background
-and wait for errors. It is enabled by running mcelog --daemon &
-from a init script. This is the fastest and most feature-ful.
-
-The recommended mode is daemon, because several new functions (like page error
-predictive failure analysis) require a continuously running daemon.
-
-Documentation:
-
-The primary reference documentation are the man pages.
-lk10-mcelog.pdf has a overview over the errors mcelog handles
-(originally from Linux Kongress 2010)
-mce.pdf is a very old paper describing the first releases of mcelog
-(some parts are obsolete)
-
-For distributors:
-
-You can run mcelog from systemd or similar daemons. An example
-systemd unit file is in mcelog.service.
-
-For older distributions using init scripts:
-
-Please install a init script by default that runs mcelog in daemon mode.
-The mcelog.init script is a good starting point.
-
-Also install a logrotated file (mcelog.logrotate) or equivalent
-when mcelog is running in daemon mode.
-
-These two are not in make install.
-
-The installation also requires a config file (/etc/mcelog.conf) and
-the default triggers. These are all installed by "make install"
-
-/dev/mcelog is needed for mcelog operation
-If it's not there it can be created with mknod /dev/mcelog c 10 227
-Normally it should be created automatically in udev.
-
-Security:
-
-mcelog needs to run as root because it might trigger actions like
-page-offlining, which require CAP_SYS_ADMIN. Also it opens /dev/mcelog
-and a unix socket for client support.
-
-It also opens /dev/mem to parse the BIOS DMI tables. It is careful
-to close the file descriptor and unmap any mappings after using them.
-
-There is support for changing the user in daemon mode after opening
-the device and the sockets, but that would stop triggers from
-doing corrective action that require root.
-
-In principle it would be possible to only keep CAP_SYS_ADMIN
-for page-offling, but that would prevent triggers from doing root
-only actions not covered by it (and CAP_SYS_ADMIN is not that different
-from full root)
-
-In daemon mode mcelog listens to a unix socket and processes
-requests from mcelog --client. This can be disabled in the configuration file.
-The uid/gid of the requestor is checked on access and is configurable
-(default 0/0 only). The command parsing code is very straight forward
-(server.c) The client parsing/reply is currently done with full privileges
-of the daemon.
-
-Testing:
-
-There is a simple test suite in tests/. The test suite requires root to
-run and access to mce-inject and a kernel with MCE injection support
-(CONFIG_X86_MCE_INJECT). It will kill any running mcelog daemon.
-
-Run it with "make test"
-
-The test suite requires the mce-inject tool, available from
-git://git.kernel.org/pub/utils/cpu/mce/mce-inject.git
-The mce-inject executable must be either in $PATH or in the
-../mce-inject directory.
-
-You can also test under valgrind with "make valgrind-test". For
-this valgrind needs to be installed of course. Advanced
-valgrind options can be specified with
-make VALGRIND="valgrind --option" valgrind-test
-
-Other checks:
-
-make iccverify and make clangverify run the static verifiers
-in clang and icc respectively.
-
-License:
-
-This program is licensed under the subject of the GNU Public General
-License, v.2
-
diff -urNp mcelog-d2e13bf0.orig/README.md mcelog-d2e13bf0/README.md
--- mcelog-d2e13bf0.orig/README.md 1969-12-31 19:00:00.000000000 -0500
+++ mcelog-d2e13bf0/README.md 2016-11-30 11:24:12.202619289 -0500
@@ -0,0 +1,129 @@
+# mcelog
+
+mcelog is the user space backend for logging machine check errors reported
+by the hardware to the kernel. The kernel does the immediate actions
+(like killing processes etc.) and mcelog decodes the errors and manages
+various other advanced error responses like offlining memory, CPUs or triggering
+events. In addition mcelog also handles corrected errors, by logging and
+accounting them.
+It primarily handles machine checks and thermal events, which are reported
+for errors detected by the CPU.
+
+For more details on what mcelog can do and the underlying theory
+see [mcelog.org](http://www.mcelog.org).
+
+It is recommended that mcelog runs on all x86 machines, both 64bit
+(since early 2.6) and 32bit (since 2.6.32).
+
+mcelog can run in several modes:
+
+- cronjob
+- trigger
+- daemon
+
+**cronjob** is the old method. mcelog runs every 5 minutes from cron and checks
+for errors. The disadvantage of this is that it can delay error reporting
+significantly (up to 10 minutes) and does not allow mcelog to keep extended state.
+
+**trigger** is a newer method where the kernel runs mcelog on an error.
+
+This is configured with:
+```sh
+echo /usr/sbin/mcelog > /sys/devices/system/machinecheck/machinecheck0/trigger
+```
+This is faster, but still doesn't allow mcelog to keep state,
+and has relatively high overhead for each error because a program has
+to be initialized from scratch.
+
+In **daemon** mode mcelog runs continuously as a daemon in the background and
+waits for errors. It is enabled by running `mcelog --daemon &`
+from an init script. This is the fastest and most feature-ful.
+
+The recommended mode is **daemon**, because several new functions (like page
+error predictive failure analysis) require a continuously running daemon.
+
+## Documentation
+
+- The primary reference documentation is the man pages.
+- [lk10-mcelog.pdf](https://github.com/andikleen/mcelog/blob/master/lk10-mcelog.pdf)
+ has an overview of the errors mcelog handles (originally from Linux Kongress 2010).
+- [mce.pdf](https://github.com/andikleen/mcelog/blob/master/mce.pdf)
+ is a very old paper describing the first releases of mcelog (some parts are obsolete).
+
+## For distributors
+
+You can run mcelog from systemd or similar daemons. An example systemd unit
+file is in `mcelog.service`.
+
+### For older distributions using init scripts
+
+Please install an init script by default that runs mcelog in daemon mode.
+The `mcelog.init` script is a good starting point. Also install a
+logrotated file (mcelog.logrotate) or equivalent when mcelog is running
+in daemon mode.
+These two are not in make install.
+
+The installation also requires a config file `/etc/mcelog.conf` and the default
+triggers. These are all installed by `make install`.
+
+`/dev/mcelog` is needed for mcelog operation. If it's not there it can be
+created with:
+```sh
+mknod /dev/mcelog c 10 227
+```
+
+Normally it should be created automatically in udev.
+
+## Security
+
+mcelog needs to run as root because it might trigger actions like
+page-offlining, which require `CAP_SYS_ADMIN`. Also it opens `/dev/mcelog`
+and a UNIX socket for client support.
+
+It also opens `/dev/mem` to parse the BIOS DMI tables. It is careful to close
+the file descriptor and unmap any mappings after using them.
+
+There is support for changing the user in daemon mode after opening the device
+and the sockets, but that would stop triggers from doing corrective actions
+that require `root`.
+
+In principle it would be possible to only keep `CAP_SYS_ADMIN` for page-offlining,
+but that would prevent triggers from doing root-only actions not covered by
+it (and `CAP_SYS_ADMIN` is not that different from full root).
+
+In `daemon` mode mcelog listens to a UNIX socket and processes requests from
+`mcelog --client`. This can be disabled in the configuration file.
+The uid/gid of the requestor is checked on access and is configurable
+(default 0/0 only). The command parsing code is very straightforward
+(server.c). The client parsing/reply is currently done with full privileges
+of the `daemon`.
+
+## Testing
+
+There is a simple test suite in `tests/`. The test suite requires root to
+run and access to mce-inject and a kernel with MCE injection support
+(`CONFIG_X86_MCE_INJECT`). It will kill any running mcelog daemon.
+
+Run it with `make test`.
+
+The test suite requires the
+[mce-inject](git://git.kernel.org/pub/utils/cpu/mce/mce-inject.git) tool.
+The `mce-inject` executable must be either in `$PATH` or in the
+`../mce-inject` directory.
+
+You can also test under **valgrind** with `make valgrind-test`. For this
+valgrind needs to be installed of course. Advanced valgrind options can be
+specified with:
+```sh
+make VALGRIND="valgrind --option" valgrind-test
+```
+
+### Other checks
+
+`make iccverify` and `make clangverify` run the static verifiers in *icc*
+and *clang* respectively.
+
+## License
+
+This program is licensed under the terms of the GNU General Public
+License, v.2
diff -urNp mcelog-d2e13bf0.orig/skylake_xeon.c mcelog-d2e13bf0/skylake_xeon.c
--- mcelog-d2e13bf0.orig/skylake_xeon.c 2016-11-30 11:23:54.538909475 -0500
+++ mcelog-d2e13bf0/skylake_xeon.c 2016-11-30 11:24:12.208619530 -0500
@@ -23,6 +23,11 @@
#include "skylake_xeon.h"
#include "memdb.h"
+/* Memory error was corrected by mirroring with channel failover */
+#define SKX_MCI_MISC_FO (1ULL<<63)
+/* Memory error was corrected by mirroring and primary channel scrubbed successfully */
+#define SKX_MCI_MISC_MC (1ULL<<62)
+
/* See IA32 SDM Vol3B Table 16-27 */
static char *pcu_1[] = {
@@ -208,3 +213,18 @@ void skylake_s_decode_model(int cputype,
break;
}
}
+
+int skylake_s_ce_type(int bank, u64 status, u64 misc) /* classify a corrected error: 0, 1 or 2 as noted on the returns below */
+{
+ if (!(bank == 7 || bank == 8)) /* only banks 7 and 8 are examined for the mirroring bits */
+ return 0;
+
+ if (status & MCI_STATUS_MISCV) { /* misc is consulted only when MISCV is set in status */
+ if (misc & SKX_MCI_MISC_FO)
+ return 1; /* corrected by mirroring with channel failover (checked first, so it wins if both bits are set) */
+ if (misc & SKX_MCI_MISC_MC)
+ return 2; /* corrected by mirroring, primary channel scrubbed successfully */
+ }
+
+ return 0; /* neither mirroring bit reported */
+}
diff -urNp mcelog-d2e13bf0.orig/skylake_xeon.h mcelog-d2e13bf0/skylake_xeon.h
--- mcelog-d2e13bf0.orig/skylake_xeon.h 2016-11-30 11:23:54.539909515 -0500
+++ mcelog-d2e13bf0/skylake_xeon.h 2016-11-30 11:24:12.208619530 -0500
@@ -1 +1,2 @@
void skylake_s_decode_model(int cputype, int bank, u64 status, u64 misc);
+int skylake_s_ce_type(int bank, u64 status, u64 misc);
diff -urNp mcelog-d2e13bf0.orig/sysfs.c mcelog-d2e13bf0/sysfs.c
--- mcelog-d2e13bf0.orig/sysfs.c 2016-11-30 11:23:54.534909314 -0500
+++ mcelog-d2e13bf0/sysfs.c 2016-11-30 11:24:12.208619530 -0500
@@ -37,10 +37,10 @@ char *read_field(char *base, char *name)
asprintf(&fn, "%s/%s", base, name);
fd = open(fn, O_RDONLY);
+ free(fn);
if (fstat(fd, &st) < 0)
goto bad;
buf = xalloc(st.st_size);
- free(fn);
if (fd < 0)
goto bad;
n = read(fd, buf, st.st_size);
@@ -81,10 +81,12 @@ unsigned read_field_map(char *base, char
if (!strcmp(val, map->name))
break;
}
- free(val);
- if (map->name)
+ if (map->name) {
+ free(val);
return map->value;
+ }
Eprintf("sysfs field %s/%s has unknown string value `%s'\n", base, name, val);
+ free(val);
return -1;
}
diff -urNp mcelog-d2e13bf0.orig/TODO-diskdb mcelog-d2e13bf0/TODO-diskdb
--- mcelog-d2e13bf0.orig/TODO-diskdb 2016-11-30 11:23:54.530909154 -0500
+++ mcelog-d2e13bf0/TODO-diskdb 1969-12-31 19:00:00.000000000 -0500
@@ -1,31 +0,0 @@
-
-diskdb was a experimental attempt to track errors per DIMM
-on disk. It ran into problems unfortunately.
-
-diskdb is not compiled by default now. It can be enabled with
-make CONFIG_DISKDB=1
-
-It is replaced with a new memory only database now that
-relies on daemon mode.
-
-Open fundamental issues:
-- DIMM tracking over boot doesn't work due to SMBIOS not reporting
-serial numbers
-
-Code problems:
-- Missing aging
-- For Intel Nehalem CE errors need reverse smbios translation
-- SMBIOS interleaving decoding missing
-- Some crash races in db.c (see comments there)
-- Need lock timeout
-- Default enable/disable heuristics (smbios check etc.)
-- write db test suite (with crash)
-
-General:
-- Missing CPU database
-
-Missing:
-- rename to different name without memory
-
-Old:
-- add ifdef for memory because it's broken
diff -urNp mcelog-d2e13bf0.orig/xeon75xx.c mcelog-d2e13bf0/xeon75xx.c
--- mcelog-d2e13bf0.orig/xeon75xx.c 2016-11-30 11:23:54.537909435 -0500
+++ mcelog-d2e13bf0/xeon75xx.c 1969-12-31 19:00:00.000000000 -0500
@@ -1,39 +0,0 @@
-/* Copyright (C) 2009/2010 Intel Corporation
-
- Decode Intel Xeon75xx memory errors. Requires the mce-75xx.ko driver
- load. The core errors are the same as Nehalem.
-
- mcelog is free software; you can redistribute it and/or
- modify it under the terms of the GNU General Public
- License as published by the Free Software Foundation; version
- 2.
-
- mcelog is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should find a copy of v2 of the GNU General Public License somewhere
- on your Linux system; if not, write to the Free Software Foundation,
- Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
- Author: Andi Kleen
-*/
-
-#include <stdio.h>
-#include <stddef.h>
-#include "mcelog.h"
-#include "xeon75xx.h"
-
-/* This used to decode the old xeon 75xx memory error aux format. But that has never
- been merged into mainline kernels, so removed it again. */
-
-void
-xeon75xx_memory_error(struct mce *m, unsigned msize, int *channel, int *dimm)
-{
-}
-
-
-void xeon75xx_decode_dimm(struct mce *m, unsigned msize)
-{
-}
diff -urNp mcelog-d2e13bf0.orig/xeon75xx.h mcelog-d2e13bf0/xeon75xx.h
--- mcelog-d2e13bf0.orig/xeon75xx.h 2016-11-30 11:23:54.537909435 -0500
+++ mcelog-d2e13bf0/xeon75xx.h 1969-12-31 19:00:00.000000000 -0500
@@ -1,2 +0,0 @@
-void xeon75xx_memory_error(struct mce *m, unsigned msize, int *channel, int *dimm);
-void xeon75xx_decode_dimm(struct mce *m, unsigned msize);