diff --git a/drivers/Makefile b/drivers/Makefile
index d5c687b8..522ba01f 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -8,6 +8,7 @@
 obj-m += mfd/
 obj-m += pinctrl/
 obj-m += platform/tegra/
 obj-m += pwm/
+obj-m += ras/
 obj-m += thermal/
 obj-m += watchdog/
diff --git a/drivers/ras/Makefile b/drivers/ras/Makefile
new file mode 100644
index 00000000..0657a33f
--- /dev/null
+++ b/drivers/ras/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+
+obj-m += arm64-ras.o
diff --git a/drivers/ras/arm64-ras.c b/drivers/ras/arm64-ras.c
new file mode 100644
index 00000000..52124fd8
--- /dev/null
+++ b/drivers/ras/arm64-ras.c
@@ -0,0 +1,515 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+
+#include <linux/cpu.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/arm64_ras.h>
+
+static int fhi_irq[CONFIG_NR_CPUS];
+static u8 is_ras_probe_done;
+static LIST_HEAD(fhi_callback_list);
+static DEFINE_RAW_SPINLOCK(fhi_lock);
+
+/* saved hotplug state */
+static enum cpuhp_state hp_state;
+
+static const struct of_device_id arm64_ras_match[] = {
+        {
+                .name = "arm64_ras",
+                .compatible = "arm,armv8.2-ras",
+        },
+        { },
+};
+MODULE_DEVICE_TABLE(of, arm64_ras_match);
+
+/* Architecturally defined primary error codes,
+ * used for decoding the ERRSTATUS.SERR bits.
+ */
+static struct ras_error serr_errors[] = {
+        {.name = "No error", .error_code = 0x0},
+        {.name = "Implementation defined error", .error_code = 0x1},
+        {.name = "Data value from non-associative internal memory",
+                .error_code = 0x2},
+        {.name = "Implementation defined pin", .error_code = 0x3},
+        {.name = "Assertion Failure", .error_code = 0x4},
+        {.name = "Internal Data Path", .error_code = 0x5},
+        {.name = "Data value from associative memory", .error_code = 0x6},
+        {.name = "Address/control value(s) from associative memory",
+                .error_code = 0x7},
+        {.name = "Data value from a TLB", .error_code = 0x8},
+        {.name = "Address/control value(s) from a TLB", .error_code = 0x9},
+        {.name = "Data value from producer", .error_code = 0xA},
+        {.name = "Address/control value(s) from producer", .error_code = 0xB},
+        {.name = "Data value from (non-associative) external memory",
+                .error_code = 0xC},
+        {.name = "Illegal address (software fault)", .error_code = 0xD},
+        {.name = "Illegal access (software fault)", .error_code = 0xE},
+        {.name = "Illegal state (software fault)", .error_code = 0xF},
+        {.name = "Internal data register", .error_code = 0x10},
+        {.name = "Internal control register", .error_code = 0x11},
+        {.name = "Error response from slave", .error_code = 0x12},
+        {.name = "External timeout", .error_code = 0x13},
+        {.name = "Internal timeout", .error_code = 0x14},
+        {.name = "Deferred error from slave not supported at master",
+                .error_code = 0x15},
+};
+
+/* Read ERRSELR */
+u64 ras_read_errselr(void)
+{
+        u64 errselr;
+
+        asm volatile("mrs %0, s3_0_c5_c3_1" : "=r" (errselr));
+        return errselr;
+}
+EXPORT_SYMBOL(ras_read_errselr);
+
+/* For reads/writes of any ERR register, error record <n> is selected by
+ * writing its index to ERRSELR_EL1. The trailing ISB ensures this write
+ * completes before a subsequent read/write of that error record is issued.
+ */
+void ras_write_errselr(u64 errx)
+{
+        asm volatile("msr s3_0_c5_c3_1, %0" : : "r" (errx));
+        isb();
+}
+EXPORT_SYMBOL(ras_write_errselr);
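+
+/* Illustrative usage sketch (example only, not part of the exported API):
+ * reading the status of error record <n> is the two-step
+ * select-then-read sequence:
+ *
+ *        ras_write_errselr(n);                    select record n (isb inside)
+ *        status = ras_read_error_status();        now reads ERR<n>STATUS
+ */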
+
+/* Read ERRSTATUS */
+u64 ras_read_error_status(void)
+{
+        u64 status;
+
+        asm volatile("mrs %0, s3_0_c5_c4_2" : "=r" (status));
+        return status;
+}
+EXPORT_SYMBOL(ras_read_error_status);
+
+/* Write to ERRSTATUS */
+void ras_write_error_status(u64 status)
+{
+        asm volatile("msr s3_0_c5_c4_2, %0" : : "r" (status));
+}
+EXPORT_SYMBOL(ras_write_error_status);
+
+/* Read ERRADDR */
+static u64 ras_read_error_address(void)
+{
+        u64 addr;
+
+        asm volatile("mrs %0, s3_0_c5_c4_3" : "=r" (addr));
+        return addr;
+}
+
+/* Write to ERRADDR */
+void ras_write_error_addr(u64 addr)
+{
+        asm volatile("msr s3_0_c5_c4_3, %0" : : "r" (addr));
+}
+EXPORT_SYMBOL(ras_write_error_addr);
+
+/* Read ERRMISC0 */
+static u64 ras_read_error_misc0(void)
+{
+        u64 misc0;
+
+        asm volatile("mrs %0, s3_0_c5_c5_0" : "=r" (misc0));
+        return misc0;
+}
+
+/* Write to ERRMISC0 */
+void ras_write_error_misc0(u64 misc0)
+{
+        asm volatile("msr s3_0_c5_c5_0, %0" : : "r" (misc0));
+}
+EXPORT_SYMBOL(ras_write_error_misc0);
+
+/* Read ERRMISC1 */
+static u64 ras_read_error_misc1(void)
+{
+        u64 misc1;
+
+        asm volatile("mrs %0, s3_0_c5_c5_1" : "=r" (misc1));
+        return misc1;
+}
+
+/* Write to ERRMISC1 */
+void ras_write_error_misc1(u64 misc1)
+{
+        asm volatile("msr s3_0_c5_c5_1, %0" : : "r" (misc1));
+}
+EXPORT_SYMBOL(ras_write_error_misc1);
+
+/* Read ERRCTLR_EL1 */
+u64 ras_read_error_control(void)
+{
+        u64 err_ctl;
+
+        asm volatile("mrs %0, s3_0_c5_c4_1" : "=r" (err_ctl));
+        return err_ctl;
+}
+EXPORT_SYMBOL(ras_read_error_control);
+
+/* Write to ERRCTLR_EL1 */
+void ras_write_error_control(u64 err_ctl)
+{
+        asm volatile("msr s3_0_c5_c4_1, %0" : : "r" (err_ctl));
+}
+EXPORT_SYMBOL(ras_write_error_control);
+
+/* Read ERRPFGCTL_EL1 */
+u64 ras_read_pfg_control(void)
+{
+        u64 pfg_ctl;
+
+        asm volatile("mrs %0, s3_0_c15_c1_4" : "=r" (pfg_ctl));
+        return pfg_ctl;
+}
+EXPORT_SYMBOL(ras_read_pfg_control);
+
+/* Write to ERRPFGCTL_EL1 */
+void ras_write_pfg_control(u64 pfg_ctl)
+{
+        asm volatile("msr s3_0_c15_c1_4, %0" : : "r" (pfg_ctl));
+}
+EXPORT_SYMBOL(ras_write_pfg_control);
+
+/* Read ERRPFGCDN_EL1 */
+u64 ras_read_pfg_cdn(void)
+{
+        u64 pfg_cdn;
+
+        asm volatile("mrs %0, s3_0_c15_c1_6" : "=r" (pfg_cdn));
+        return pfg_cdn;
+}
+EXPORT_SYMBOL(ras_read_pfg_cdn);
+
+/* Write to ERRPFGCDN_EL1 */
+void ras_write_pfg_cdn(u64 pfg_cdn)
+{
+        asm volatile("msr s3_0_c15_c1_6, %0" : : "r" (pfg_cdn));
+}
+EXPORT_SYMBOL(ras_write_pfg_cdn);
+
+/* Read ERRFR_EL1 */
+u64 ras_read_feature_reg(void)
+{
+        u64 fr;
+
+        asm volatile("mrs %0, s3_0_c5_c4_0" : "=r" (fr));
+        return fr;
+}
+EXPORT_SYMBOL(ras_read_feature_reg);
+
+void unregister_fhi_callback(struct ras_fhi_callback *callback)
+{
+        unsigned long flags;
+
+        raw_spin_lock_irqsave(&fhi_lock, flags);
+        list_del(&callback->node);
+        raw_spin_unlock_irqrestore(&fhi_lock, flags);
+}
+EXPORT_SYMBOL(unregister_fhi_callback);
+
+static unsigned int read_pfr0_ras_version(void)
+{
+        return PFR0_RAS(read_cpuid(ID_AA64PFR0_EL1));
+}
+
+static void is_ras_version1(void *ret)
+{
+        *(int *)(ret) = (read_pfr0_ras_version() == PFR0_RAS_VERSION_1);
+}
+
+/* Check whether RAS is supported on the given CPU.
+ * Returns 0 if not supported, 1 if supported.
+ */
+int is_ras_cpu(int cpu)
+{
+        int ret = 0;
+
+        smp_call_function_single(cpu, is_ras_version1, &ret, 1);
+        return ret;
+}
+EXPORT_SYMBOL(is_ras_cpu);
+
+int is_this_ras_cpu(void)
+{
+        return read_pfr0_ras_version() == PFR0_RAS_VERSION_1;
+}
+EXPORT_SYMBOL(is_this_ras_cpu);
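+
+/* Note: is_ras_cpu() reads ID_AA64PFR0_EL1 on the target CPU via
+ * smp_call_function_single(), since an ID register describes only the
+ * core it is read on; this matters on asymmetric systems.
+ */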
+
+int is_ras_ready(void)
+{
+        return is_ras_probe_done == 1;
+}
+EXPORT_SYMBOL(is_ras_ready);
+
+void print_error_record(struct error_record *record, u64 status, int errselr)
+{
+        struct ras_error *errors;
+        u64 misc0, misc1, addr;
+        u16 ierr, serr;
+        u64 i;
+        int found = 0;
+        u64 err_status = 0;
+
+        pr_crit("**************************************\n");
+        pr_crit("RAS Error in %s, ERRSELR_EL1=%d:\n", record->name, errselr);
+        pr_crit("\tStatus = 0x%llx\n", status);
+
+        /* Find the name of IERR */
+        ierr = get_error_status_ierr(status);
+        errors = record->errors;
+        if (errors) {
+                for (i = 0; errors[i].name; i++) {
+                        if (errors[i].error_code == ierr) {
+                                pr_crit("\tIERR = %s: 0x%x\n",
+                                        errors[i].name, ierr);
+                                found = 1;
+                                break;
+                        }
+                }
+                if (!found)
+                        pr_crit("\tUnknown IERR: 0x%x\n", ierr);
+        } else {
+                pr_crit("\tBank does not have any known IERR errors\n");
+        }
+
+        /* Find the name of SERR */
+        found = 0;
+        serr = get_error_status_serr(status);
+
+        for (i = 0; i < ARRAY_SIZE(serr_errors); i++) {
+                if (serr_errors[i].error_code == serr) {
+                        pr_crit("\tSERR = %s: 0x%x\n", serr_errors[i].name,
+                                serr);
+                        found = 1;
+                        break;
+                }
+        }
+        if (!found)
+                pr_crit("\tUnknown SERR: 0x%x\n", serr);
+
+        if (status & ERRi_STATUS_OF) {
+                pr_crit("\tOverflow (there may be more errors) - Uncorrectable\n");
+                /* Clear the error by writing 1 to ERRi_STATUS:OF */
+                err_status |= ERRi_STATUS_OF;
+        }
+        if (status & ERRi_STATUS_UE) {
+                pr_crit("\tUncorrectable (this is fatal)\n");
+                /* Clear the error by writing 1 to ERRi_STATUS:UE
+                 * and 0b11 to ERRi_STATUS:UET
+                 */
+                err_status |= ERRi_STATUS_UE;
+                err_status |= ERRi_STATUS_UET;
+        }
+        if (get_error_status_ce(status)) {
+                pr_crit("\tCorrectable Error\n");
+                /* Clear the error by writing 1 to ERRi_STATUS:CE */
+                err_status |= ERRi_STATUS_CE;
+        }
+        if (status & ERRi_STATUS_MV) {
+                misc0 = ras_read_error_misc0();
+                misc1 = ras_read_error_misc1();
+                pr_crit("\tMISC0 = 0x%llx\n", misc0);
+                pr_crit("\tMISC1 = 0x%llx\n", misc1);
+                /* Write 1 to clear ERRi_STATUS:MV */
+                err_status |= ERRi_STATUS_MV;
+        }
+        if (status & ERRi_STATUS_AV) {
+                addr = ras_read_error_address();
+                pr_crit("\tADDR = 0x%llx\n", addr);
+                /* Write 1 to clear ERRi_STATUS:AV */
+                err_status |= ERRi_STATUS_AV;
+        }
+
+        /* Write 1 to clear ERRi_STATUS:V */
+        err_status |= ERRi_STATUS_VALID;
+
+        /* Write to ERRSTATUS to clear the error */
+        ras_write_error_status(err_status);
+        pr_crit("**************************************\n");
+}
+EXPORT_SYMBOL(print_error_record);
+
+/* Fault Handling Interrupt (FHI) handler,
+ * used for handling Correctable Errors.
+ */
+static irqreturn_t ras_fhi_isr(int irq, void *dev_id)
+{
+        unsigned long flags;
+        struct ras_fhi_callback *callback;
+
+        /* Each registered callback iterates through its error banks,
+         * looking for one with an error.
+         */
+        pr_crit("CPU%d: RAS: FHI %d detected\n", smp_processor_id(), irq);
+
+        raw_spin_lock_irqsave(&fhi_lock, flags);
+        list_for_each_entry(callback, &fhi_callback_list, node) {
+                callback->fn();
+        }
+        raw_spin_unlock_irqrestore(&fhi_lock, flags);
+
+        return IRQ_HANDLED;
+}
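+
+/* CPU hotplug callbacks: each FHI is a per-CPU interrupt, so it is pinned
+ * to its CPU when that CPU comes online and parked on the boot CPU when
+ * it goes offline.
+ */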
+static int ras_fhi_enable(unsigned int cpu)
+{
+        if (irq_force_affinity(fhi_irq[cpu], cpumask_of(cpu))) {
+                pr_info("%s: Failed to set IRQ %d affinity to CPU%d\n",
+                        __func__, fhi_irq[cpu], cpu);
+                return -EINVAL;
+        }
+        enable_irq(fhi_irq[cpu]);
+        pr_info("%s: FHI %d enabled on CPU%d\n", __func__, fhi_irq[cpu], cpu);
+        return 0;
+}
+
+static int ras_fhi_disable(unsigned int cpu)
+{
+        int boot_cpu = 0;
+
+        disable_irq_nosync(fhi_irq[cpu]);
+        if (irq_force_affinity(fhi_irq[cpu], cpumask_of(boot_cpu)))
+                pr_info("%s: Failed to set IRQ %d affinity to boot CPU %d\n",
+                        __func__, fhi_irq[cpu], boot_cpu);
+        pr_info("%s: FHI %d disabled\n", __func__, fhi_irq[cpu]);
+        return 0;
+}
+
+/* Fetch the per-CPU Fault Handling Interrupts (FHIs)
+ * used for Correctable Errors.
+ */
+static int ras_register_fhi_isr(struct platform_device *pdev)
+{
+        int err = 0, cpu, irq_count;
+
+        irq_count = platform_irq_count(pdev);
+
+        for_each_possible_cpu(cpu) {
+                BUG_ON(cpu >= irq_count);
+                fhi_irq[cpu] = platform_get_irq(pdev, cpu);
+                if (fhi_irq[cpu] <= 0) {
+                        dev_err(&pdev->dev, "No IRQ\n");
+                        err = -ENOENT;
+                        goto isr_err;
+                }
+        }
+
+isr_err:
+        return err;
+}
+
+/* This is the API for CPU specific FHI callbacks
+ * to be registered with the fhi_isr handler.
+ */
+int register_fhi_callback(struct ras_fhi_callback *callback, void *cookie)
+{
+        unsigned long flags;
+        int err = 0, cpu = 0;
+
+        raw_spin_lock_irqsave(&fhi_lock, flags);
+        list_add(&callback->node, &fhi_callback_list);
+        raw_spin_unlock_irqrestore(&fhi_lock, flags);
+
+        for_each_possible_cpu(cpu) {
+                err = request_irq(fhi_irq[cpu], ras_fhi_isr,
+                                  IRQF_PERCPU, "ras-fhi", cookie);
+                if (err) {
+                        pr_err("%s: request_irq(%d) failed (%d)\n", __func__,
+                               fhi_irq[cpu], err);
+                        goto isr_err;
+                }
+                disable_irq(fhi_irq[cpu]);
+        }
+        /* Ensure that any CPU brought online sets up FHI */
+        hp_state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+                                     "arm64_ras:online",
+                                     ras_fhi_enable,
+                                     ras_fhi_disable);
+        if (hp_state < 0)
+                pr_info("%s: unable to register CPU hotplug state for FHI\n",
+                        __func__);
+
+isr_err:
+        return err;
+}
+EXPORT_SYMBOL(register_fhi_callback);
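+
+/* Typical client flow (sketch): a CPU-specific driver fills in a
+ * struct ras_fhi_callback whose .fn scans that implementation's error
+ * records, then calls register_fhi_callback() from its own probe; see
+ * the sketch at the end of this patch.
+ */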
+
+static int ras_probe(struct platform_device *pdev)
+{
+        const struct of_device_id *match;
+        int err = 0, cpu, probe = 0;
+        struct device *dev = &pdev->dev;
+
+        /* Probe only if RAS is supported on at least one online CPU */
+        for_each_online_cpu(cpu) {
+                if (is_ras_cpu(cpu))
+                        probe = 1;
+        }
+
+        if (!probe) {
+                dev_info(dev, "None of the CPUs support RAS\n");
+                return -EINVAL;
+        }
+
+        /* Register FHI for Correctable Errors */
+        match = of_match_device(of_match_ptr(arm64_ras_match), dev);
+        if (!match) {
+                dev_err(dev, "RAS detected, but no matching device-tree node found; cannot register RAS fault handling interrupt (FHI)\n");
+                return -ENODEV;
+        }
+        err = ras_register_fhi_isr(pdev);
+        if (err < 0) {
+                dev_info(dev, "Failed to register Fault Handling Interrupt ISR\n");
+                return err;
+        }
+
+        /* Make sure we have executed everything in the probe
+         * before setting is_ras_probe_done.
+         */
+        smp_mb();
+        is_ras_probe_done = 1;
+        dev_info(dev, "probed\n");
+        return 0;
+}
+
+static int ras_remove(struct platform_device *pdev)
+{
+        cpuhp_remove_state(hp_state);
+        return 0;
+}
+
+static struct platform_driver ras_driver = {
+        .probe = ras_probe,
+        .remove = ras_remove,
+        .driver = {
+                .owner = THIS_MODULE,
+                .name = "arm64_ras",
+                .of_match_table = of_match_ptr(arm64_ras_match),
+        },
+};
+
+static int __init ras_init(void)
+{
+        return platform_driver_register(&ras_driver);
+}
+
+static void __exit ras_exit(void)
+{
+        platform_driver_unregister(&ras_driver);
+}
+
+arch_initcall(ras_init);
+module_exit(ras_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("ARM64 RAS Driver / FHI handler");
+MODULE_AUTHOR("Laxman Dewangan <ldewangan@nvidia.com>");
diff --git a/include/linux/arm64_ras.h b/include/linux/arm64_ras.h
new file mode 100644
--- /dev/null
+++ b/include/linux/arm64_ras.h
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */
+
+#ifndef _LINUX_ARM64_RAS_H__
+#define _LINUX_ARM64_RAS_H__
+
+#include <linux/list.h>
+#include <linux/types.h>
+
+/* Known-error table and error record, as consumed by print_error_record() */
+struct ras_error {
+        char *name;
+        u16 error_code;
+};
+
+struct error_record {
+        char *name;
+        struct ras_error *errors;
+};
+
+#define RAS_BIT(_x_)            (1ULL << (_x_))
+#define RAS_MASK(_msb_, _lsb_)  \
+        ((RAS_BIT((_msb_) - (_lsb_) + 1) - 1) << (_lsb_))
+#define RAS_EXTRACT(_x_, _msb_, _lsb_)  \
+        (((_x_) & RAS_MASK(_msb_, _lsb_)) >> _lsb_)
+
+#define RAS_CTL_CFI             RAS_BIT(8)
+#define RAS_CTL_UE              RAS_BIT(4)
+#define RAS_CTL_ED              RAS_BIT(0)
+
+#define ERRi_STATUS_UET         (RAS_BIT(20) | RAS_BIT(21))
+#define ERRi_STATUS_CE          (RAS_BIT(24) | RAS_BIT(25))
+#define ERRi_STATUS_MV          RAS_BIT(26)
+#define ERRi_STATUS_OF          RAS_BIT(27)
+#define ERRi_STATUS_UE          RAS_BIT(29)
+#define ERRi_STATUS_VALID       RAS_BIT(30)
+#define ERRi_STATUS_AV          RAS_BIT(31)
+
+#define ERRi_PFGCTL_CDNEN       RAS_BIT(31)
+#define ERRi_PFGCTL_R           RAS_BIT(30)
+#define ERRi_PFGCTL_CE          RAS_BIT(6)
+#define ERRi_PFGCTL_UC          RAS_BIT(1)
+
+#define ERRi_PFGCDN_CDN_1       0x1
+
+#define get_error_status_ce(_x_)        RAS_EXTRACT(_x_, 25, 24)
+#define get_error_status_ierr(_x_)      RAS_EXTRACT(_x_, 15, 8)
+#define get_error_status_serr(_x_)      RAS_EXTRACT(_x_, 7, 0)
+
+struct ras_fhi_callback {
+        struct list_head node;
+        void (*fn)(void);
+};
+
+/* Macros for reading ID_AA64PFR0_EL1 - RAS version field */
+#define PFR0_RAS_SHIFT  28
+#define PFR0_RAS_MASK   (0xfUL << PFR0_RAS_SHIFT)
+#define PFR0_RAS(pfr0)  \
+        (((pfr0) & PFR0_RAS_MASK) >> PFR0_RAS_SHIFT)
+#define PFR0_RAS_VERSION_1      0x1
+
+/* RAS functions needed by the ras_carmel driver */
+int is_ras_ready(void);
+int is_this_ras_cpu(void);
+int is_ras_cpu(int cpu);
+u64 ras_read_error_status(void);
+u64 ras_read_errselr(void);
+u64 ras_read_pfg_control(void);
+u64 ras_read_pfg_cdn(void);
+u64 ras_read_error_control(void);
+u64 ras_read_feature_reg(void);
+void ras_write_error_control(u64 err_ctl);
+void ras_write_error_status(u64 status);
+void ras_write_error_addr(u64 addr);
+void ras_write_error_misc0(u64 misc0);
+void ras_write_error_misc1(u64 misc1);
+void ras_write_error_statustrigger(u64 status);
+void ras_write_pfg_control(u64 pfg_ctl);
+void ras_write_pfg_cdn(u64 pfg_cdn);
+void ras_write_errselr(u64 errx);
+void print_error_record(struct error_record *record, u64 status, int errselr);
+int register_fhi_callback(struct ras_fhi_callback *callback, void *cookie);
+void unregister_fhi_callback(struct ras_fhi_callback *callback);
+
+#endif // _LINUX_ARM64_RAS_H__
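
Usage note (illustrative sketch, not part of the patch): the pieces above compose as follows in a hypothetical client driver. All names below other than the arm64_ras.h interfaces are made up for illustration.

    #include <linux/arm64_ras.h>

    /* Hypothetical per-implementation error table; a real table lists the
     * implementation defined IERR codes for one error bank.
     */
    static struct ras_error example_errors[] = {
            {.name = "Example corrected error", .error_code = 0x2},
            {}      /* tables are name-terminated, as record->errors above */
    };

    static struct error_record example_record = {
            .name = "EXAMPLE_BANK",
            .errors = example_errors,
    };

    /* Runs from ras_fhi_isr() whenever an FHI fires */
    static void example_fhi_handler(void)
    {
            u64 status;

            if (!is_this_ras_cpu())
                    return;

            /* Select error record 0, read its status, decode and clear it */
            ras_write_errselr(0);
            status = ras_read_error_status();
            if (status & ERRi_STATUS_VALID)
                    print_error_record(&example_record, status, 0);
    }

    static struct ras_fhi_callback example_callback = {
            .fn = example_fhi_handler,
    };

    /* From the client driver's probe, once is_ras_ready() reports 1:
     *
     *        register_fhi_callback(&example_callback, cookie);
     */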