arm64: Add RAS driver for ARM64-RAS

RAS refers to three aspects of the dependability of a system:
   Reliability: continuity of correct service
   Availability: readiness for correct service
   Serviceability: ability to undergo modifications and repairs

The ARMv8.2 RAS extension provides the architectural support to achieve the above in a system.

The arm64_ras driver handles Correctable Errors using a per-core
Fault Handling Interrupt (FHI). Uncorrectable errors are handled by
raising an SError exception to the kernel or to Trusted Firmware (EL3)
on the CCPLEX.

The driver provides an API for CPU-specific RAS drivers to
register callbacks for the FHI. When an FHI occurs, the FHI
ISR walks the list of registered callbacks and executes
them. Similarly, the driver allows registering SError callbacks
for uncorrectable errors within a core, a core cluster and the
CCPLEX. When an SError is reported, those callbacks are scanned
to find and report the error in a core, cluster or the CCPLEX.
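
A minimal usage sketch of this API, assuming a hypothetical
CPU-specific client driver (my_fhi_handler and my_fhi_cb are
illustrative names, not part of this change):

  /* Hypothetical client of the arm64_ras FHI callback API; names are
   * illustrative only.
   */
  #include <linux/module.h>
  #include <linux/arm64-ras.h>

  static void my_fhi_handler(void)
  {
          u64 status;

          ras_write_errselr(0);                 /* select error record 0 */
          status = ras_read_error_status();     /* read ERR<0>STATUS */

          /* A full handler would walk its error records and call
           * print_error_record() to decode and clear them.
           */
          if (status & ERRi_STATUS_VALID)
                  pr_info("my-ras: FHI, ERR<0>STATUS = 0x%llx\n", status);
  }

  static struct ras_fhi_callback my_fhi_cb = {
          .fn = my_fhi_handler,
  };

  /* In the client's probe, once is_ras_ready() returns 1:
   *        register_fhi_callback(&my_fhi_cb, pdev);
   * and on removal:
   *        unregister_fhi_callback(&my_fhi_cb);
   */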

Bug 3625675

Change-Id: I6dba7178fa7bbcf55bfa083be1b077874f4865c9
Signed-off-by: Laxman Dewangan <ldewangan@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nv-oot/+/2705396
GVS: Gerrit_Virtual_Submit
Laxman Dewangan
2022-04-30 17:58:11 +00:00
committed by mobile promotions
parent 527743a5c3
commit 3637fb99cb
4 changed files with 605 additions and 0 deletions


@@ -8,6 +8,7 @@ obj-m += mfd/
obj-m += pinctrl/
obj-m += platform/tegra/
obj-m += pwm/
obj-m += ras/
obj-m += thermal/
obj-m += watchdog/

drivers/ras/Makefile (new file)

@@ -0,0 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
obj-m += arm64-ras.o

drivers/ras/arm64-ras.c (new file)

@@ -0,0 +1,515 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/of_device.h>
#include <linux/arm64-ras.h>
#include <linux/of_irq.h>
#include <linux/cpuhotplug.h>
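/* Per-core FHI interrupt numbers, indexed by CPU id */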
static int fhi_irq[CONFIG_NR_CPUS];
static u8 is_ras_probe_done;
static LIST_HEAD(fhi_callback_list);
static DEFINE_RAW_SPINLOCK(fhi_lock);
/* saved hotplug state */
static enum cpuhp_state hp_state;
static const struct of_device_id arm64_ras_match[] = {
{
.name = "arm64_ras",
.compatible = "arm,armv8.2-ras",
},
{ },
};
MODULE_DEVICE_TABLE(of, arm64_ras_match);
/* Architecturally defined primary error code
* Used for decoding ERR<x>STATUS:SERR bits
*/
static struct ras_error serr_errors[] = {
{.name = "No error", .error_code = 0x0},
{.name = "Implementation defined error", .error_code = 0x1},
{.name = "Data value from non-associative internal memory",
.error_code = 0x2},
{.name = "Implementation defined pin", .error_code = 0x3},
{.name = "Assertion Failure", .error_code = 0x4},
{.name = "Internal Data Path", .error_code = 0x5},
{.name = "Data value from associative memory", .error_code = 0x6},
{.name = "Address/control value(s) from associative memory",
.error_code = 0x7},
{.name = "Data value from a TLB", .error_code = 0x8},
{.name = "Address/control value(s) from a TLB", .error_code = 0x9},
{.name = "Data value from producer", .error_code = 0xA},
{.name = "Address/control value(s) from producer", .error_code = 0xB},
{.name = "Data value from (non-associative) external memory",
.error_code = 0xC},
{.name = "Illegal address (software fault)", .error_code = 0xD},
{.name = "Illegal access (software fault)", .error_code = 0xE},
{.name = "Illegal state (software fault)", .error_code = 0xF},
{.name = "Internal data register", .error_code = 0x10},
{.name = "Internal control register", .error_code = 0x11},
{.name = "Error response from slave", .error_code = 0x12},
{.name = "External timeout", .error_code = 0x13},
{.name = "Internal timeout", .error_code = 0x14},
{.name = "Deferred error from slave not supported at master",
.error_code = 0x15},
};
/* Read ERRSELR_EL1 */
u64 ras_read_errselr(void)
{
u64 errselr;
asm volatile("mrs %0, s3_0_c5_c3_1" : "=r" (errselr));
return errselr;
}
EXPORT_SYMBOL(ras_read_errselr);
/* For read/write of any ERR<X> register, error record X is selected
* by writing to ERRSELR_EL1
* ISB is needed in the end to ensure this write completes before another
* read/write to this error record comes in.
*/
void ras_write_errselr(u64 errx)
{
asm volatile("msr s3_0_c5_c3_1, %0" : : "r" (errx));
isb();
}
EXPORT_SYMBOL(ras_write_errselr);
/* Read ERR<X>STATUS */
u64 ras_read_error_status(void)
{
u64 status;
asm volatile("mrs %0, s3_0_c5_c4_2" : "=r" (status));
return status;
}
EXPORT_SYMBOL(ras_read_error_status);
/* Write to ERR<X>STATUS */
void ras_write_error_status(u64 status)
{
asm volatile("msr s3_0_c5_c4_2, %0" : : "r" (status));
}
EXPORT_SYMBOL(ras_write_error_status);
/* Read ERR<X>ADDR */
static u64 ras_read_error_address(void)
{
u64 addr;
asm volatile("mrs %0, s3_0_c5_c4_3" : "=r" (addr));
return addr;
}
/* Write to ERR<X>ADDR*/
void ras_write_error_addr(u64 addr)
{
asm volatile("msr s3_0_c5_c4_3, %0" : : "r" (addr));
}
EXPORT_SYMBOL(ras_write_error_addr);
/* Read ERR<X>MISC0 */
static u64 ras_read_error_misc0(void)
{
u64 misc0;
asm volatile("mrs %0, s3_0_c5_c5_0" : "=r" (misc0));
return misc0;
}
/* Write to ERR<X>MISC0*/
void ras_write_error_misc0(u64 misc0)
{
asm volatile("msr s3_0_c5_c5_0, %0" : : "r" (misc0));
}
EXPORT_SYMBOL(ras_write_error_misc0);
/* Read ERR<X>MISC1 */
static u64 ras_read_error_misc1(void)
{
u64 misc1;
asm volatile("mrs %0, s3_0_c5_c5_1" : "=r" (misc1));
return misc1;
}
/* Write to ERR<X>MISC1*/
void ras_write_error_misc1(u64 misc1)
{
asm volatile("msr s3_0_c5_c5_1, %0" : : "r" (misc1));
}
EXPORT_SYMBOL(ras_write_error_misc1);
/* Read ERR<X>CTLR_EL1*/
u64 ras_read_error_control(void)
{
u64 err_ctl;
asm volatile("mrs %0, s3_0_c5_c4_1" : "=r" (err_ctl));
return err_ctl;
}
EXPORT_SYMBOL(ras_read_error_control);
/* Write to ERR<X>CTLR_EL1*/
void ras_write_error_control(u64 err_ctl)
{
asm volatile("msr s3_0_c5_c4_1, %0" : : "r" (err_ctl));
}
EXPORT_SYMBOL(ras_write_error_control);
/* Read ERR<X>PFGCTL_EL1*/
u64 ras_read_pfg_control(void)
{
u64 pfg_ctl;
asm volatile("mrs %0, s3_0_c15_c1_4" : "=r" (pfg_ctl));
return pfg_ctl;
}
EXPORT_SYMBOL(ras_read_pfg_control);
/* Write to ERR<X>PFGCTL_EL1*/
void ras_write_pfg_control(u64 pfg_ctl)
{
asm volatile("msr s3_0_c15_c1_4, %0" : : "r" (pfg_ctl));
}
EXPORT_SYMBOL(ras_write_pfg_control);
/* Read ERR<X>PFGCDN_EL1*/
u64 ras_read_pfg_cdn(void)
{
u64 pfg_cdn;
asm volatile("mrs %0, s3_0_c15_c1_6" : "=r" (pfg_cdn));
return pfg_cdn;
}
EXPORT_SYMBOL(ras_read_pfg_cdn);
/* Write to ERR<X>PFGCDN_EL1*/
void ras_write_pfg_cdn(u64 pfg_cdn)
{
asm volatile("msr s3_0_c15_c1_6, %0" : : "r" (pfg_cdn));
}
EXPORT_SYMBOL(ras_write_pfg_cdn);
/* Read ERR<X>FR_EL1*/
u64 ras_read_feature_reg(void)
{
u64 fr;
asm volatile("mrs %0, s3_0_c5_c4_0" : "=r" (fr));
return fr;
}
EXPORT_SYMBOL(ras_read_feature_reg);
void unregister_fhi_callback(struct ras_fhi_callback *callback)
{
unsigned long flags;
raw_spin_lock_irqsave(&fhi_lock, flags);
list_del(&callback->node);
raw_spin_unlock_irqrestore(&fhi_lock, flags);
}
EXPORT_SYMBOL(unregister_fhi_callback);
static unsigned int read_pfr0_ras_version(void)
{
return PFR0_RAS(read_cpuid(ID_AA64PFR0_EL1));
}
static void is_ras_version1(void *ret)
{
*(int *)(ret) = (read_pfr0_ras_version() == PFR0_RAS_VERSION_1);
}
/* Check whether RAS is supported on the given CPU
 * ret = 0 if not supported
 *     = 1 if supported
 */
int is_ras_cpu(int cpu)
{
int ret = 0;
smp_call_function_single(cpu, is_ras_version1, &ret, 1);
return ret;
}
EXPORT_SYMBOL(is_ras_cpu);
int is_this_ras_cpu(void)
{
return read_pfr0_ras_version() == PFR0_RAS_VERSION_1;
}
EXPORT_SYMBOL(is_this_ras_cpu);
int is_ras_ready(void)
{
return is_ras_probe_done == 1;
}
EXPORT_SYMBOL(is_ras_ready);
void print_error_record(struct error_record *record, u64 status, int errselr)
{
struct ras_error *errors;
u64 misc0, misc1, addr;
u16 ierr, serr;
u64 i;
int found = 0;
u64 err_status = 0;
pr_crit("**************************************\n");
pr_crit("RAS Error in %s, ERRSELR_EL1=%d:\n", record->name, errselr);
pr_crit("\tStatus = 0x%llx\n", status);
/* Find the name of IERR */
ierr = get_error_status_ierr(status);
errors = record->errors;
if (errors) {
for (i = 0; errors[i].name; i++) {
if (errors[i].error_code == ierr) {
pr_crit("\tIERR = %s: 0x%x\n",
errors[i].name, ierr);
found = 1;
break;
}
}
if (!found)
pr_crit("\tUnknown IERR: 0x%x\n", ierr);
} else {
pr_crit("\tBank does not have any known IERR errors\n");
}
/* Find the name of SERR */
found = 0;
serr = get_error_status_serr(status);
for (i = 0; serr_errors[i].name; i++) {
if (serr_errors[i].error_code == serr) {
pr_crit("\tSERR = %s: 0x%x\n", serr_errors[i].name,
serr);
found = 1;
break;
}
}
if (!found)
pr_crit("\tUnknown SERR: 0x%x\n", serr);
if (status & ERRi_STATUS_OF) {
pr_crit("\tOverflow (there may be more errors) - Uncorrectable\n");
/* Clear the error by writing 1 to ERRi_STATUS:OF */
err_status |= ERRi_STATUS_OF;
}
if (status & ERRi_STATUS_UE) {
pr_crit("\tUncorrectable (this is fatal)\n");
/* Clear the error by writing 1 to ERR_STATUS:UE
* and 11 to ERR_STATUS:UET
*/
err_status |= ERRi_STATUS_UE;
err_status |= ERRi_STATUS_UET;
}
if (get_error_status_ce(status)) {
pr_crit("\tCorrectable Error\n");
/* Clear the error by writing 1 to ERR_STATUS:CE */
err_status |= ERRi_STATUS_CE;
}
if (status & ERRi_STATUS_MV) {
misc0 = ras_read_error_misc0();
misc1 = ras_read_error_misc1();
pr_crit("\tMISC0 = 0x%llx\n", misc0);
pr_crit("\tMISC1 = 0x%llx\n", misc1);
/* write 1 to clear ERR_STATUS:MV */
err_status |= ERRi_STATUS_MV;
}
if (status & ERRi_STATUS_AV) {
addr = ras_read_error_address();
pr_crit("\tADDR = 0x%llx\n", addr);
/* write 1 to clear ERR_STATUS:AV */
err_status |= ERRi_STATUS_AV;
}
/* Write 1 to clear ERR_STATUS:V */
err_status |= ERRi_STATUS_VALID;
/* Write to ERR_STATUS to clear the error */
ras_write_error_status(err_status);
pr_crit("**************************************\n");
}
EXPORT_SYMBOL(print_error_record);
/* Fault Handling Interrupt or FHI
* for handling Correctable Errors
*/
static irqreturn_t ras_fhi_isr(int irq, void *dev_id)
{
unsigned long flags;
struct ras_fhi_callback *callback;
/* Iterate through the banks looking for one with an error */
pr_crit("CPU%d: RAS: FHI %d detected\n", smp_processor_id(), irq);
raw_spin_lock_irqsave(&fhi_lock, flags);
list_for_each_entry(callback, &fhi_callback_list, node) {
callback->fn();
}
raw_spin_unlock_irqrestore(&fhi_lock, flags);
return IRQ_HANDLED;
}
static int ras_fhi_enable(unsigned int cpu)
{
if (irq_force_affinity(fhi_irq[cpu], cpumask_of(cpu))) {
pr_info("%s: Failed to set IRQ %d affinity to CPU%d\n",
__func__, fhi_irq[cpu], cpu);
return -EINVAL;
}
enable_irq(fhi_irq[cpu]);
pr_info("%s: FHI %d enabled on CPU%d\n", __func__, fhi_irq[cpu], cpu);
return 0;
}
static int ras_fhi_disable(unsigned int cpu)
{
int boot_cpu = 0;
disable_irq_nosync(fhi_irq[cpu]);
if (irq_force_affinity(fhi_irq[cpu], cpumask_of(boot_cpu)))
pr_info("%s: Failed to set IRQ %d affinity to boot cpu %d\n",
__func__, fhi_irq[cpu], boot_cpu);
pr_info("%s: FHI %d disabled\n", __func__, fhi_irq[cpu]);
return 0;
}
/* Look up the per-core Fault Handling Interrupt (FHI) numbers used
* for Correctable Errors; request_irq() is deferred until a client
* registers an FHI callback.
*/
static int ras_register_fhi_isr(struct platform_device *pdev)
{
int err = 0, cpu, irq_count;
irq_count = platform_irq_count(pdev);
for_each_possible_cpu(cpu) {
BUG_ON(cpu >= irq_count);
fhi_irq[cpu] = platform_get_irq(pdev, cpu);
if (fhi_irq[cpu] <= 0) {
dev_err(&pdev->dev, "No IRQ\n");
err = -ENOENT;
goto isr_err;
}
}
isr_err:
return err;
}
/* API for CPU-specific RAS drivers to register their FHI callbacks
* with the ras_fhi_isr() handler
*/
int register_fhi_callback(struct ras_fhi_callback *callback, void *cookie)
{
unsigned long flags;
int err = 0, cpu = 0;
raw_spin_lock_irqsave(&fhi_lock, flags);
list_add(&callback->node, &fhi_callback_list);
raw_spin_unlock_irqrestore(&fhi_lock, flags);
for_each_possible_cpu(cpu) {
err = request_irq(fhi_irq[cpu], ras_fhi_isr,
IRQ_PER_CPU, "ras-fhi", cookie);
if (err) {
pr_err("%s: request_irq(%d) failed (%d)\n", __func__,
fhi_irq[cpu], err);
goto isr_err;
}
disable_irq(fhi_irq[cpu]);
}
/* Ensure that any CPU brought online sets up FHI */
hp_state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
"arm64_ras:online",
ras_fhi_enable,
ras_fhi_disable);
if (hp_state < 0)
pr_info("%s: unable to register FHI\n", __func__);
isr_err:
return err;
}
EXPORT_SYMBOL(register_fhi_callback);
static int ras_probe(struct platform_device *pdev)
{
const struct of_device_id *match;
int err = 0, cpu, probe = 0;
struct device *dev = &pdev->dev;
/* probe only if RAS is supported on at least one online CPU */
for_each_online_cpu(cpu) {
if (is_ras_cpu(cpu))
probe = 1;
}
if (!probe) {
dev_info(dev, "None of the CPUs support RAS");
return -EINVAL;
}
/* register FHI for Correctable Errors */
match = of_match_device(of_match_ptr(arm64_ras_match),
dev);
if (!match) {
dev_err(dev, "RAS detected, but no device-tree node found;"
" cannot register RAS fault handling interrupt (FHI)");
return -ENODEV;
}
err = ras_register_fhi_isr(pdev);
if (err < 0) {
dev_info(dev, "Failed to register Fault Handling Interrupts ISR");
return err;
}
/* make sure we have executed everything in the probe
* before setting is_ras_probe_done
*/
smp_mb();
is_ras_probe_done = 1;
dev_info(dev, "probed");
return 0;
}
static int ras_remove(struct platform_device *pdev)
{
cpuhp_remove_state(hp_state);
return 0;
}
static struct platform_driver ras_driver = {
.probe = ras_probe,
.remove = ras_remove,
.driver = {
.owner = THIS_MODULE,
.name = "arm64_ras",
.of_match_table = of_match_ptr(arm64_ras_match),
},
};
static int __init ras_init(void)
{
return platform_driver_register(&ras_driver);
}
static void __exit ras_exit(void)
{
platform_driver_unregister(&ras_driver);
}
arch_initcall(ras_init);
module_exit(ras_exit);
MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("ARM64 RAS Driver / FHI handler");
MODULE_AUTHOR("Laxman Dewangan <ldewangan@nvidia.com>");

include/linux/arm64-ras.h (new file)

@@ -0,0 +1,85 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#ifndef _LINUX_ARM64_RAS_H__
#define _LINUX_ARM64_RAS_H__
struct ras_error {
char *name;
u16 error_code;
};
struct error_record {
struct list_head node;
char *name;
u64 errx;
u8 processed;
u64 err_ctlr;
struct ras_error *errors;
};
#define RAS_BIT(_bit_) (1ULL << (_bit_))
#define RAS_MASK(_msb_, _lsb_) \
((RAS_BIT(_msb_+1) - 1ULL) & ~(RAS_BIT(_lsb_) - 1ULL))
#define RAS_EXTRACT(_x_, _msb_, _lsb_) \
(((_x_) & RAS_MASK(_msb_, _lsb_)) >> (_lsb_))
#define RAS_CTL_CFI RAS_BIT(8)
#define RAS_CTL_UE RAS_BIT(4)
#define RAS_CTL_ED RAS_BIT(0)
#define ERRi_STATUS_UET ((RAS_BIT(20)) | (RAS_BIT(21)))
#define ERRi_STATUS_CE ((RAS_BIT(24)) | (RAS_BIT(25)))
#define ERRi_STATUS_MV RAS_BIT(26)
#define ERRi_STATUS_OF RAS_BIT(27)
#define ERRi_STATUS_UE RAS_BIT(29)
#define ERRi_STATUS_VALID RAS_BIT(30)
#define ERRi_STATUS_AV RAS_BIT(31)
#define ERRi_PFGCTL_CDNEN RAS_BIT(31)
#define ERRi_PFGCTL_R RAS_BIT(30)
#define ERRi_PFGCTL_CE RAS_BIT(6)
#define ERRi_PFGCTL_UC RAS_BIT(1)
#define ERRi_PFGCDN_CDN_1 0x1
#define get_error_status_ce(_x_) RAS_EXTRACT(_x_, 25, 24)
#define get_error_status_ierr(_x_) RAS_EXTRACT(_x_, 15, 8)
#define get_error_status_serr(_x_) RAS_EXTRACT(_x_, 7, 0)
struct ras_fhi_callback {
struct list_head node;
void (*fn)(void);
};
/* Macros for reading the ID_AA64PFR0_EL1 RAS version field */
#define PFR0_RAS_SHIFT 28
#define PFR0_RAS_MASK (0xfULL << PFR0_RAS_SHIFT)
#define PFR0_RAS(pfr0) \
(((pfr0) & PFR0_RAS_MASK) >> PFR0_RAS_SHIFT)
#define PFR0_RAS_VERSION_1 0x1
/* RAS functions needed by ras_carmel driver */
int is_ras_ready(void);
int is_this_ras_cpu(void);
int is_ras_cpu(int cpu);
u64 ras_read_error_status(void);
u64 ras_read_errselr(void);
u64 ras_read_pfg_control(void);
u64 ras_read_pfg_cdn(void);
u64 ras_read_error_control(void);
u64 ras_read_feature_reg(void);
void ras_write_error_control(u64 err_ctl);
void ras_write_error_status(u64 status);
void ras_write_error_addr(u64 addr);
void ras_write_error_misc0(u64 misc0);
void ras_write_error_misc1(u64 misc1);
void ras_write_error_statustrigger(u64 status);
void ras_write_pfg_control(u64 pfg_ctl);
void ras_write_pfg_cdn(u64 pfg_cdn);
void ras_write_errselr(u64 errx);
void print_error_record(struct error_record *record, u64 status, int errselr);
int register_fhi_callback(struct ras_fhi_callback *callback, void *cookie);
void unregister_fhi_callback(struct ras_fhi_callback *callback);
#endif // _LINUX_ARM64_RAS_H__
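
For reference, a hedged sketch of how a client of this header might
select and decode one error record (the record definition, the index 0
and all my_* names are illustrative assumptions, not part of this
change):

  static struct ras_error my_ierr_errors[] = {
          {.name = "Example IERR", .error_code = 0x1},
          {}      /* sentinel: print_error_record() stops at .name == NULL */
  };

  static struct error_record my_record = {
          .name   = "EXAMPLE-CORE",
          .errx   = 0,            /* record index written to ERRSELR_EL1 */
          .errors = my_ierr_errors,
  };

  static void my_check_record(void)
  {
          u64 status;

          ras_write_errselr(my_record.errx);    /* select ERR<0> */
          status = ras_read_error_status();     /* read ERR<0>STATUS */
          if (status & ERRi_STATUS_VALID)
                  print_error_record(&my_record, status, my_record.errx);
  }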