diff --git a/Makefile.umbrella.tmk b/Makefile.umbrella.tmk
index 5e5541202..89aa90c38 100644
--- a/Makefile.umbrella.tmk
+++ b/Makefile.umbrella.tmk
@@ -38,6 +38,7 @@ NV_REPOSITORY_COMPONENTS += userspace/units/posix/bitops
 NV_REPOSITORY_COMPONENTS += userspace/units/posix/env
 NV_REPOSITORY_COMPONENTS += userspace/units/posix/mockio
 NV_REPOSITORY_COMPONENTS += userspace/units/posix/fault-injection
+NV_REPOSITORY_COMPONENTS += userspace/units/interface/atomic
 NV_REPOSITORY_COMPONENTS += userspace/units/pramin
 NV_REPOSITORY_COMPONENTS += userspace/units/mm/nvgpu_sgt
 NV_REPOSITORY_COMPONENTS += userspace/units/mm/nvgpu_mem
diff --git a/userspace/Makefile.sources b/userspace/Makefile.sources
index 88431c530..eb5e46b2d 100644
--- a/userspace/Makefile.sources
+++ b/userspace/Makefile.sources
@@ -53,6 +53,7 @@ UNITS := \
 	$(UNIT_SRC)/posix/fault-injection \
 	$(UNIT_SRC)/pramin \
 	$(UNIT_SRC)/fuse \
+	$(UNIT_SRC)/interface/atomic \
 	$(UNIT_SRC)/mm/nvgpu_sgt \
 	$(UNIT_SRC)/mm/allocators/nvgpu_allocator \
 	$(UNIT_SRC)/mm/gmmu/pd_cache \
diff --git a/userspace/required_tests.json b/userspace/required_tests.json
index b688ca0c2..099c4226a 100644
--- a/userspace/required_tests.json
+++ b/userspace/required_tests.json
@@ -1,4 +1,156 @@
 [
+    {
+        "test": "atomic_add_32",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_add_32_threaded",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_add_64",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_add_64_threaded",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_add_unless_32",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_add_unless_32_threaded",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_add_unless_64",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_add_unless_64_threaded",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_cmpxchg_32",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_cmpxchg_64",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_dec_32",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_dec_32_threaded",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_dec_64",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_dec_64_threaded",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_dec_and_test_32",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_dec_and_test_32_threaded",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_dec_and_test_64",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_dec_and_test_64_threaded",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_inc_32",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_inc_32_threaded",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_inc_64",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_inc_64_threaded",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_inc_and_test_32",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_inc_and_test_32_threaded",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_inc_and_test_64",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_inc_and_test_64_threaded",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_set_and_read_32",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_set_and_read_64",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_sub_32",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_sub_32_threaded",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_sub_64",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_sub_64_threaded",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_sub_and_test_32",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_sub_and_test_32_threaded",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_sub_and_test_64",
+        "unit": "atomic"
+    },
+    {
+        "test": "atomic_sub_and_test_64_threaded",
+        "unit": "atomic"
+    },
+    {
"atomic_xchg_32", + "unit": "atomic" + }, + { + "test": "atomic_xchg_64", + "unit": "atomic" + }, { "test": "enabled_flags_false_check", "unit": "enabled" diff --git a/userspace/units/interface/atomic/Makefile b/userspace/units/interface/atomic/Makefile new file mode 100644 index 000000000..b089b2097 --- /dev/null +++ b/userspace/units/interface/atomic/Makefile @@ -0,0 +1,26 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +.SUFFIXES: + +OBJS = atomic.o +MODULE = atomic + +include ../../Makefile.units diff --git a/userspace/units/interface/atomic/Makefile.interface.tmk b/userspace/units/interface/atomic/Makefile.interface.tmk new file mode 100644 index 000000000..ae549b780 --- /dev/null +++ b/userspace/units/interface/atomic/Makefile.interface.tmk @@ -0,0 +1,23 @@ +################################### tell Emacs this is a -*- makefile-gmake -*- +# +# Copyright (c) 2019, NVIDIA CORPORATION. All Rights Reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. +# +# tmake for SW Mobile component makefile +# +############################################################################### + +NVGPU_UNIT_NAME=atomic + +include $(NV_COMPONENT_DIR)/../../Makefile.units.common.interface.tmk + +# Local Variables: +# indent-tabs-mode: t +# tab-width: 8 +# End: +# vi: set tabstop=8 noexpandtab: diff --git a/userspace/units/interface/atomic/Makefile.tmk b/userspace/units/interface/atomic/Makefile.tmk new file mode 100644 index 000000000..8e03f7d94 --- /dev/null +++ b/userspace/units/interface/atomic/Makefile.tmk @@ -0,0 +1,28 @@ +################################### tell Emacs this is a -*- makefile-gmake -*- +# +# Copyright (c) 2019, NVIDIA CORPORATION. All Rights Reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. 
+#
+# tmake for SW Mobile component makefile
+#
+###############################################################################
+
+NVGPU_UNIT_NAME=atomic
+NVGPU_UNIT_SRCS=atomic.c
+
+ifneq ($(NV_BUILD_CONFIGURATION_OS_IS_QNX),1)
+NVGPU_UNIT_SHARED_LIBRARIES += pthread
+endif
+
+include $(NV_COMPONENT_DIR)/../../Makefile.units.common.tmk
+
+# Local Variables:
+# indent-tabs-mode: t
+# tab-width: 8
+# End:
+# vi: set tabstop=8 noexpandtab:
diff --git a/userspace/units/interface/atomic/atomic.c b/userspace/units/interface/atomic/atomic.c
new file mode 100644
index 000000000..9c1e3a5bd
--- /dev/null
+++ b/userspace/units/interface/atomic/atomic.c
@@ -0,0 +1,919 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdlib.h> /* for labs() */
+#include <pthread.h>
+#include <limits.h>
+#include <stdbool.h>
+
+#include <unit/unit.h>
+#include <nvgpu/atomic.h>
+
+struct atomic_struct {
+	nvgpu_atomic_t atomic;
+	nvgpu_atomic64_t atomic64;
+};
+enum atomic_width {
+	WIDTH_32,
+	WIDTH_64,
+};
+enum atomic_op {
+	op_inc,
+	op_dec,
+	op_add,
+	op_sub,
+	op_inc_and_test,
+	op_dec_and_test,
+	op_sub_and_test,
+	op_add_unless,
+};
+struct atomic_test_args {
+	enum atomic_op op;
+	enum atomic_width width;
+	long start_val;
+	long loop_count;
+	long value; /* for add/sub ops */
+};
+struct atomic_thread_info {
+	struct atomic_struct *atomic;
+	struct atomic_test_args *margs;
+	pthread_t thread;
+	int thread_num;
+	int iterations;
+	long final_val;
+	long unless;
+};
+
+/*
+ * Define macros for atomic ops that have 32b and 64b versions so we can
+ * keep the code cleaner.
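+ *
+ * Illustrative example of the dispatch: given a struct atomic_struct val,
+ * the call
+ *
+ *     ATOMIC_ADD(WIDTH_64, 5, &val);
+ *
+ * evaluates nvgpu_atomic64_add(5, &val.atomic64), while WIDTH_32 would
+ * route the same call to nvgpu_atomic_add(5, &val.atomic).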
+ */
+
+#define ATOMIC_SET(width, ref, i) \
+	((width == WIDTH_32) ? \
+		nvgpu_atomic_set(&((ref)->atomic), i) : \
+		nvgpu_atomic64_set(&((ref)->atomic64), i))
+
+#define ATOMIC_READ(width, ref) \
+	((width == WIDTH_32) ? \
+		nvgpu_atomic_read(&((ref)->atomic)) : \
+		nvgpu_atomic64_read(&((ref)->atomic64)))
+
+#define ATOMIC_INC(width, ref) \
+	((width == WIDTH_32) ? \
+		nvgpu_atomic_inc(&((ref)->atomic)) : \
+		nvgpu_atomic64_inc(&((ref)->atomic64)))
+
+#define ATOMIC_INC_RETURN(width, ref) \
+	((width == WIDTH_32) ? \
+		nvgpu_atomic_inc_return(&((ref)->atomic)) : \
+		nvgpu_atomic64_inc_return(&((ref)->atomic64)))
+
+#define ATOMIC_INC_AND_TEST(width, ref) \
+	((width == WIDTH_32) ? \
+		nvgpu_atomic_inc_and_test(&((ref)->atomic)) : \
+		nvgpu_atomic64_inc_and_test(&((ref)->atomic64)))
+
+#define ATOMIC_DEC(width, ref) \
+	((width == WIDTH_32) ? \
+		nvgpu_atomic_dec(&((ref)->atomic)) : \
+		nvgpu_atomic64_dec(&((ref)->atomic64)))
+
+#define ATOMIC_DEC_RETURN(width, ref) \
+	((width == WIDTH_32) ? \
+		nvgpu_atomic_dec_return(&((ref)->atomic)) : \
+		nvgpu_atomic64_dec_return(&((ref)->atomic64)))
+
+#define ATOMIC_DEC_AND_TEST(width, ref) \
+	((width == WIDTH_32) ? \
+		nvgpu_atomic_dec_and_test(&((ref)->atomic)) : \
+		nvgpu_atomic64_dec_and_test(&((ref)->atomic64)))
+
+#define ATOMIC_ADD(width, x, ref) \
+	((width == WIDTH_32) ? \
+		nvgpu_atomic_add(x, &((ref)->atomic)) : \
+		nvgpu_atomic64_add(x, &((ref)->atomic64)))
+
+#define ATOMIC_ADD_RETURN(width, x, ref) \
+	((width == WIDTH_32) ? \
+		nvgpu_atomic_add_return(x, &((ref)->atomic)) : \
+		nvgpu_atomic64_add_return(x, &((ref)->atomic64)))
+
+#define ATOMIC_ADD_UNLESS(width, ref, a, u) \
+	(((width == WIDTH_32) ? \
+		nvgpu_atomic_add_unless(&((ref)->atomic), a, u) : \
+		nvgpu_atomic64_add_unless(&((ref)->atomic64), a, u)))
+
+#define ATOMIC_SUB(width, x, ref) \
+	((width == WIDTH_32) ? \
+		nvgpu_atomic_sub(x, &((ref)->atomic)) : \
+		nvgpu_atomic64_sub(x, &((ref)->atomic64)))
+
+#define ATOMIC_SUB_RETURN(width, x, ref) \
+	((width == WIDTH_32) ? \
+		nvgpu_atomic_sub_return(x, &((ref)->atomic)) : \
+		nvgpu_atomic64_sub_return(x, &((ref)->atomic64)))
+
+#define ATOMIC_SUB_AND_TEST(width, x, ref) \
+	((width == WIDTH_32) ? \
+		nvgpu_atomic_sub_and_test(x, &((ref)->atomic)) : \
+		nvgpu_atomic64_sub_and_test(x, &((ref)->atomic64)))
+
+#define ATOMIC_XCHG(width, ref, new) \
+	((width == WIDTH_32) ? \
+		nvgpu_atomic_xchg(&((ref)->atomic), new) : \
+		nvgpu_atomic64_xchg(&((ref)->atomic64), new))
+
+#define ATOMIC_CMPXCHG(width, ref, old, new) \
+	((width == WIDTH_32) ? \
+		nvgpu_atomic_cmpxchg(&((ref)->atomic), old, new) : \
+		nvgpu_atomic64_cmpxchg(&((ref)->atomic64), old, new))
+
+/*
+ * Helper macro that takes an atomic op from the enum and returns +1/-1
+ * to help with doing arithmetic.
+ */
+#define ATOMIC_OP_SIGN(atomic_op) \
+	({ \
+		long sign; \
+		switch (atomic_op) { \
+		case op_dec: \
+		case op_sub: \
+		case op_dec_and_test: \
+		case op_sub_and_test: \
+			sign = -1; \
+			break; \
+		default: \
+			sign = 1; \
+		} \
+		sign; \
+	})
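+
+/*
+ * For example, ATOMIC_OP_SIGN(op_sub) evaluates to -1 while
+ * ATOMIC_OP_SIGN(op_add) evaluates to +1, so the tests below can compute
+ * expected values as start + (iterations * magnitude * sign) regardless of
+ * the direction of the op.
+ */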
+
+/* Support function to do an atomic set and read verification */
+static int single_set_and_read(struct unit_module *m,
+				struct atomic_struct *atomic,
+				enum atomic_width width, const long set_val)
+{
+	long read_val;
+
+	if ((width == WIDTH_32) &&
+	    ((set_val < INT_MIN) || (set_val > INT_MAX))) {
+		unit_return_fail(m, "Invalid value for 32 op\n");
+	}
+
+	ATOMIC_SET(width, atomic, set_val);
+	read_val = ATOMIC_READ(width, atomic);
+	if (read_val != set_val) {
+		unit_err(m, "Atomic returned wrong value. Expected: %ld "
+			"Received: %ld\n", (long)set_val, (long)read_val);
+		return UNIT_FAIL;
+	}
+	return UNIT_SUCCESS;
+}
+
+/*
+ * Test atomic read and set operations single threaded for proper
+ * functionality.
+ *
+ * Tests setting the limit values for each size.
+ * Loops through setting each bit in a 32/64bit value.
+ */
+static int test_atomic_set_and_read(struct unit_module *m,
+				struct gk20a *g, void *__args)
+{
+	struct atomic_test_args *args = (struct atomic_test_args *)__args;
+	const int loop_limit = args->width == WIDTH_32 ? (sizeof(int) * 8) :
+						(sizeof(long) * 8);
+	const long min_value = args->width == WIDTH_32 ? INT_MIN : LONG_MIN;
+	const long max_value = args->width == WIDTH_32 ? INT_MAX : LONG_MAX;
+	struct atomic_struct atomic;
+	long bit_val;
+	int i;
+
+	if (single_set_and_read(m, &atomic, args->width, min_value)
+		!= UNIT_SUCCESS) {
+		return UNIT_FAIL;
+	}
+	if (single_set_and_read(m, &atomic, args->width, max_value)
+		!= UNIT_SUCCESS) {
+		return UNIT_FAIL;
+	}
+	if (single_set_and_read(m, &atomic, args->width, 0)
+		!= UNIT_SUCCESS) {
+		return UNIT_FAIL;
+	}
+
+	for (i = 0; i < loop_limit; i++) {
+		/*
+		 * Walk a single set bit across the full width. Shift an
+		 * unsigned value to avoid undefined behavior when the bit
+		 * lands in the sign position.
+		 */
+		bit_val = (args->width == WIDTH_32) ?
+			(long)(int)(1U << i) : (long)(1UL << i);
+		if (single_set_and_read(m, &atomic, args->width, bit_val)
+			!= UNIT_SUCCESS) {
+			return UNIT_FAIL;
+		}
+	}
+
+	return UNIT_SUCCESS;
+}
+
+/*
+ * Test arithmetic atomic operations single threaded for proper functionality:
+ * inc, dec, add, sub and friends (except add_unless).
+ *
+ * Sets a start value from args.
+ * Loops (iterations per args param).
+ * Validates the final result.
+ *
+ * For *_and_test ops, the args should make sure the loop traverses across 0
+ * to test the "test" part.
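+ *
+ * Worked example (matches the expected-value check below): for op_add with
+ * start_val = -500 and value = 7, each loop iteration performs two adds
+ * (add and add_return), so after i iterations the expected value is
+ * -500 + (i * 14).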
" + "Expected: %ld Received: %ld\n", + (long)expected_val, (long)read_val); + } + + if (check_result_bool) { + if (((expected_val == 0) && !result_bool) || + ((expected_val != 0) && result_bool)) { + unit_return_fail(m, + "Test result incorrect\n"); + } + } + } + + return UNIT_SUCCESS; +} + +/* + * Support function that runs in the threads for the arithmetic threaded + * test below + */ +static void *arithmetic_thread(void *__args) +{ + struct atomic_thread_info *targs = (struct atomic_thread_info *)__args; + int i; + + for (i = 0; i < targs->margs->loop_count; i++) { + if (targs->margs->op == op_inc) { + ATOMIC_INC(targs->margs->width, targs->atomic); + } else if (targs->margs->op == op_dec) { + ATOMIC_DEC(targs->margs->width, targs->atomic); + } else if (targs->margs->op == op_add) { + /* + * Save the last value to sanity that threads aren't + * running sequentially + */ + targs->final_val = ATOMIC_ADD_RETURN( + targs->margs->width, + targs->margs->value, + targs->atomic); + } else if (targs->margs->op == op_add) { + ATOMIC_ADD(targs->margs->width, targs->margs->value, + targs->atomic); + } else if (targs->margs->op == op_sub) { + ATOMIC_SUB(targs->margs->width, targs->margs->value, + targs->atomic); + } else if (targs->margs->op == op_inc_and_test) { + if (ATOMIC_INC_AND_TEST(targs->margs->width, + targs->atomic)) { + /* + * Only increment if atomic op returns true + * (that the value is 0) + */ + targs->iterations++; + } + } else if (targs->margs->op == op_dec_and_test) { + if (ATOMIC_DEC_AND_TEST(targs->margs->width, + targs->atomic)) { + /* + * Only increment if atomic op returns true + * (that the value is 0) + */ + targs->iterations++; + } + } else if (targs->margs->op == op_sub_and_test) { + if (ATOMIC_SUB_AND_TEST(targs->margs->width, + targs->margs->value, + targs->atomic)) { + /* + * Only increment if atomic op returns true + * (that the value is 0) + */ + targs->iterations++; + } + } else if (targs->margs->op == op_add_unless) { + if (ATOMIC_ADD_UNLESS(targs->margs->width, + targs->atomic, targs->margs->value, + targs->unless) != targs->unless) { + /* + * Increment until the atomic value is the + * "unless" value. + */ + targs->iterations++; + } + } else { + /* + * Don't print an error here because it would print + * for each thread. The main thread will catch this. + */ + break; + } + } + + return NULL; +} + +/* + * Support function to make sure the threaded arithmetic tests ran the correct + * number of iterations across threads, if applicable. + */ +static bool correct_thread_iteration_count(struct unit_module *m, + struct atomic_thread_info *threads, + int num_threads, + long expected_iterations) +{ + int i; + long total_iterations = 0; + + for (i = 0; i < num_threads; i++) { + total_iterations += threads[i].iterations; + } + + if (total_iterations != expected_iterations) { + unit_err(m, "threaded test op took wrong number of iterations " + "expected %ld took: %ld\n", + expected_iterations, total_iterations); + return false; + } + + return true; +} + +/* + * Test arithmetic operations in threads to verify atomicity. + * + * Sets initial start value + * Kicks off threads to loop running ops + * When threads finish loops, verify values + * + * With the ops that have a return, save the final value for each thread and + * use that to try to ensure that the threads aren't executing sequentially. 
+ */
+static int test_atomic_arithmetic_threaded(struct unit_module *m,
+					struct gk20a *g, void *__args)
+{
+	struct atomic_test_args *args = (struct atomic_test_args *)__args;
+	struct atomic_struct atomic;
+	const int num_threads = 100;
+	struct atomic_thread_info threads[num_threads];
+	int i;
+	long expected_val, val, expected_iterations;
+
+	if (single_set_and_read(m, &atomic, args->width, args->start_val)
+		!= UNIT_SUCCESS) {
+		return UNIT_FAIL;
+	}
+
+	/* setup threads */
+	for (i = 0; i < num_threads; i++) {
+		threads[i].atomic = &atomic;
+		threads[i].margs = args;
+		threads[i].thread_num = i;
+		threads[i].iterations = 0;
+		/* For add_unless, add until we hit half the iterations */
+		threads[i].unless = args->start_val +
+				(num_threads * args->loop_count / 2);
+	}
+	/*
+	 * start threads - This is done separately to try to increase
+	 * parallelism of the threads by starting them as closely together
+	 * as possible. It is also done in reverse to avoid compiler
+	 * optimization.
+	 */
+	for (i = (num_threads - 1); i >= 0; i--) {
+		pthread_create(&threads[i].thread, NULL, arithmetic_thread,
+				&threads[i]);
+	}
+
+	/* wait for all threads to complete */
+	for (i = 0; i < num_threads; i++) {
+		pthread_join(threads[i].thread, NULL);
+	}
+
+	val = ATOMIC_READ(args->width, &atomic);
+
+	switch (args->op) {
+	case op_add_unless:
+		/*
+		 * For add_unless, the threads increment their iteration
+		 * counts until the atomic reaches the unless value,
+		 * but continue calling the op in the loop to make sure
+		 * it doesn't actually add anymore.
+		 */
+		expected_iterations = (threads[0].unless -
+					args->start_val + 1) /
+					args->value;
+		if (!correct_thread_iteration_count(m, threads,
+				num_threads, expected_iterations)) {
+			return UNIT_FAIL;
+		}
+		expected_val = threads[0].unless;
+		break;
+
+	case op_inc_and_test:
+	case op_dec_and_test:
+	case op_sub_and_test:
+		/*
+		 * The threads only increment when the atomic op
+		 * reports that it hit 0 which should only happen once.
+		 */
+		if (!correct_thread_iteration_count(m, threads,
+				num_threads, 1)) {
+			return UNIT_FAIL;
+		}
+		/* fall through! */
+
+	case op_add:
+	case op_sub:
+	case op_inc:
+	case op_dec:
+		expected_val = args->start_val +
+			(args->loop_count * num_threads *
+			ATOMIC_OP_SIGN(args->op) * args->value);
+		break;
+
+	default:
+		unit_return_fail(m, "Test error: invalid op in %s\n",
+				__func__);
+	}
+
+	/* sanity check */
+	if ((args->width == WIDTH_32) &&
+	    ((expected_val > INT_MAX) || (expected_val < INT_MIN))) {
+		unit_return_fail(m, "Test error: invalid value in %s\n",
+				__func__);
+	}
+
+	if (val != expected_val) {
+		unit_return_fail(m, "threaded value incorrect "
+				"expected: %ld result: %ld\n",
+				expected_val, val);
+	}
+
+	if (args->op == op_add) {
+		/* sanity test that the threads aren't all sequential */
+		bool sequential = true;
+
+		for (i = 0; i < (num_threads - 1); i++) {
+			/*
+			 * A thread that ran all of its loops uninterrupted
+			 * ends exactly loop_count * value away from its
+			 * neighbor's final value.
+			 */
+			if (labs(threads[i].final_val - threads[i+1].final_val)
+				!= (args->loop_count * args->value)) {
+				sequential = false;
+				break;
+			}
+		}
+		if (sequential) {
+			unit_return_fail(m, "threads appear to have run "
+					"sequentially!\n");
+		}
+	}
+
+	return UNIT_SUCCESS;
+}
+
+/*
+ * Test xchg op single threaded for proper functionality
+ *
+ * Loops calling xchg op with different values making sure the returned
+ * value is the last one written.
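+ *
+ * For example, if the previous call wrote -3 and this call exchanges in 4,
+ * the op must return -3: xchg atomically stores the new value and returns
+ * the value it replaced.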
+ */
+static int test_atomic_xchg(struct unit_module *m,
+				struct gk20a *g, void *__args)
+{
+	struct atomic_test_args *args = (struct atomic_test_args *)__args;
+	struct atomic_struct atomic;
+	int i;
+	long new_val, old_val, ret_val;
+
+	if (single_set_and_read(m, &atomic, args->width, args->start_val)
+		!= UNIT_SUCCESS) {
+		return UNIT_FAIL;
+	}
+
+	old_val = args->start_val;
+	for (i = 0; i < args->loop_count; i++) {
+		/*
+		 * alternate positive and negative values while increasing
+		 * based on the loop counter
+		 */
+		new_val = (i % 2 ? 1 : -1) * (args->start_val + i);
+		ret_val = ATOMIC_XCHG(args->width, &atomic, new_val);
+		if (ret_val != old_val) {
+			unit_return_fail(m, "xchg returned bad old val "
+					"Expected: %ld, Received: %ld\n",
+					old_val, ret_val);
+		}
+		old_val = new_val;
+	}
+
+	return UNIT_SUCCESS;
+}
+
+/*
+ * Test cmpxchg single threaded for proper functionality
+ *
+ * Loop calling cmpxchg. Alternating between matching and not matching.
+ * Verify correct behavior for each call.
+ */
+static int test_atomic_cmpxchg(struct unit_module *m,
+				struct gk20a *g, void *__args)
+{
+	struct atomic_test_args *args = (struct atomic_test_args *)__args;
+	struct atomic_struct atomic;
+	const int switch_interval = 5;
+	int i;
+	long new_val, old_val, ret_val;
+	bool should_match = true;
+
+	if (single_set_and_read(m, &atomic, args->width, args->start_val)
+		!= UNIT_SUCCESS) {
+		return UNIT_FAIL;
+	}
+
+	old_val = args->start_val;
+	for (i = 0; i < args->loop_count; i++) {
+		/*
+		 * alternate whether the cmp should match each
+		 * switch_interval
+		 */
+		if ((i % switch_interval) == 0) {
+			should_match = !should_match;
+		}
+
+		new_val = args->start_val + i;
+		if (should_match) {
+			ret_val = ATOMIC_CMPXCHG(args->width, &atomic,
+						old_val, new_val);
+			if (ret_val != old_val) {
+				unit_return_fail(m,
+					"cmpxchg returned bad old val "
+					"Expected: %ld, Received: %ld\n",
+					old_val, ret_val);
+			}
+			ret_val = ATOMIC_READ(args->width, &atomic);
+			if (ret_val != new_val) {
+				unit_return_fail(m,
+					"cmpxchg did not update "
+					"Expected: %ld, Received: %ld\n",
+					new_val, ret_val);
+			}
+			old_val = new_val;
+		} else {
+			ret_val = ATOMIC_CMPXCHG(args->width, &atomic,
+						-1 * old_val, new_val);
+			if (ret_val != old_val) {
+				unit_return_fail(m,
+					"cmpxchg returned bad old val "
+					"Expected: %ld, Received: %ld\n",
+					old_val, ret_val);
+			}
+			ret_val = ATOMIC_READ(args->width, &atomic);
+			if (ret_val != old_val) {
+				unit_return_fail(m,
+					"cmpxchg should not have updated "
+					"Expected: %ld, Received: %ld\n",
+					old_val, ret_val);
+			}
+		}
+	}
+
+	return UNIT_SUCCESS;
+}
+
+/*
+ * Test add_unless op single threaded for proper functionality
+ *
+ * Loop through calling the operation. Alternating whether the add should
+ * occur or not (i.e. changing the "unless" value).
+ * Verify correct behavior for each operation.
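+ *
+ * As exercised here, add_unless(v, a, u) atomically adds a to v unless the
+ * current value equals u, and returns the value seen before the call either
+ * way, which lets the caller tell whether the add actually happened.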
+ */
+static int test_atomic_add_unless(struct unit_module *m,
+				struct gk20a *g, void *__args)
+{
+	struct atomic_test_args *args = (struct atomic_test_args *)__args;
+	struct atomic_struct atomic;
+	const int switch_interval = 5;
+	int i;
+	int new_val, old_val, ret_val;
+	bool should_update = true;
+
+	if (single_set_and_read(m, &atomic, args->width, args->start_val)
+		!= UNIT_SUCCESS) {
+		return UNIT_FAIL;
+	}
+	old_val = args->start_val;
+	for (i = 0; i < args->loop_count; i++) {
+		/* alternate whether add should occur every switch_interval */
+		if ((i % switch_interval) == 0) {
+			should_update = !should_update;
+		}
+
+		if (should_update) {
+			/* This will fail to match and do the add */
+			ret_val = ATOMIC_ADD_UNLESS(args->width, &atomic,
+						args->value, old_val - 1);
+			if (ret_val != old_val) {
+				unit_return_fail(m,
+					"add_unless returned bad old val "
+					"Expected: %d, Received: %d\n",
+					old_val, ret_val);
+			}
+			new_val = old_val + args->value;
+			ret_val = ATOMIC_READ(args->width, &atomic);
+			if (ret_val != new_val) {
+				unit_return_fail(m, "add_unless did not "
+						"update Expected: %d, "
+						"Received: %d\n",
+						new_val, ret_val);
+			}
+			old_val = ret_val;
+		} else {
+			/* This will match the old value and won't add */
+			ret_val = ATOMIC_ADD_UNLESS(args->width, &atomic,
+						args->value, old_val);
+			if (ret_val != old_val) {
+				unit_return_fail(m,
+					"add_unless returned bad old val "
+					"Expected: %d, Received: %d\n",
+					old_val, ret_val);
+			}
+			ret_val = ATOMIC_READ(args->width, &atomic);
+			if (ret_val != old_val) {
+				unit_return_fail(m, "add_unless should not "
+						"have updated Expected: %d, "
+						"Received: %d\n",
+						old_val, ret_val);
+			}
+		}
+	}
+
+	return UNIT_SUCCESS;
+}
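+
+/*
+ * Arguments for the tests above. Note that the 64-bit inc/dec/add/sub
+ * variants start near INT_MAX/INT_MIN so the running value crosses the
+ * 32-bit limits, which would catch an op that silently truncated to 32 bits.
+ */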
+static struct atomic_test_args set_and_read_32_arg = {
+	.width = WIDTH_32,
+};
+static struct atomic_test_args set_and_read_64_arg = {
+	.width = WIDTH_64,
+};
+static struct atomic_test_args inc_32_arg = {
+	.op = op_inc,
+	.width = WIDTH_32,
+	.start_val = -500,
+	.loop_count = 10000,
+	.value = 1,
+};
+static struct atomic_test_args inc_and_test_32_arg = {
+	/* must cross 0 */
+	.op = op_inc_and_test,
+	.width = WIDTH_32,
+	.start_val = -500,
+	.loop_count = 10000,
+	.value = 1,
+};
+static struct atomic_test_args inc_and_test_64_arg = {
+	/* must cross 0 */
+	.op = op_inc_and_test,
+	.width = WIDTH_64,
+	.start_val = -500,
+	.loop_count = 10000,
+	.value = 1,
+};
+static struct atomic_test_args inc_64_arg = {
+	.op = op_inc,
+	.width = WIDTH_64,
+	.start_val = INT_MAX - 500,
+	.loop_count = 10000,
+	.value = 1,
+};
+static struct atomic_test_args dec_32_arg = {
+	.op = op_dec,
+	.width = WIDTH_32,
+	.start_val = 500,
+	.loop_count = 10000,
+	.value = 1,
+};
+static struct atomic_test_args dec_and_test_32_arg = {
+	/* must cross 0 */
+	.op = op_dec_and_test,
+	.width = WIDTH_32,
+	.start_val = 500,
+	.loop_count = 10000,
+	.value = 1,
+};
+static struct atomic_test_args dec_and_test_64_arg = {
+	/* must cross 0 */
+	.op = op_dec_and_test,
+	.width = WIDTH_64,
+	.start_val = 500,
+	.loop_count = 10000,
+	.value = 1,
+};
+static struct atomic_test_args dec_64_arg = {
+	.op = op_dec,
+	.width = WIDTH_64,
+	.start_val = INT_MIN + 500,
+	.loop_count = 10000,
+	.value = 1,
+};
+static struct atomic_test_args add_32_arg = {
+	.op = op_add,
+	.width = WIDTH_32,
+	.start_val = -500,
+	.loop_count = 10000,
+	.value = 7,
+};
+static struct atomic_test_args add_64_arg = {
+	.op = op_add,
+	.width = WIDTH_64,
+	.start_val = INT_MAX - 500,
+	.loop_count = 10000,
+	.value = 7,
+};
+static struct atomic_test_args sub_32_arg = {
+	.op = op_sub,
+	.width = WIDTH_32,
+	.start_val = 500,
+	.loop_count = 10000,
+	.value = 7,
+};
+static struct atomic_test_args sub_64_arg = {
+	.op = op_sub,
+	.width = WIDTH_64,
+	.start_val = INT_MIN + 500,
+	.loop_count = 10000,
+	.value = 7,
+};
+static struct atomic_test_args sub_and_test_32_arg = {
+	/* must cross 0 */
+	.op = op_sub_and_test,
+	.width = WIDTH_32,
+	.start_val = 500,
+	.loop_count = 10000,
+	.value = 5,
+};
+static struct atomic_test_args sub_and_test_64_arg = {
+	/* must cross 0 */
+	.op = op_sub_and_test,
+	.width = WIDTH_64,
+	.start_val = 500,
+	.loop_count = 10000,
+	.value = 5,
+};
+static struct atomic_test_args xchg_32_arg = {
+	.width = WIDTH_32,
+	.start_val = 1,
+	.loop_count = 10000,
+};
+static struct atomic_test_args xchg_64_arg = {
+	.width = WIDTH_64,
+	.start_val = INT_MAX,
+	.loop_count = 10000,
+};
+static struct atomic_test_args add_unless_32_arg = {
+	/* must loop at least 10 times */
+	.op = op_add_unless,
+	.width = WIDTH_32,
+	.start_val = -500,
+	.loop_count = 10000,
+	.value = 5,
+};
+static struct atomic_test_args add_unless_64_arg = {
+	/* must loop at least 10 times */
+	.op = op_add_unless,
+	.width = WIDTH_64,
+	.start_val = -500,
+	.loop_count = 10000,
+	.value = 5,
+};
+
+struct unit_module_test atomic_tests[] = {
+	UNIT_TEST(atomic_set_and_read_32, test_atomic_set_and_read, &set_and_read_32_arg),
+	UNIT_TEST(atomic_set_and_read_64, test_atomic_set_and_read, &set_and_read_64_arg),
+	UNIT_TEST(atomic_inc_32, test_atomic_arithmetic, &inc_32_arg),
+	UNIT_TEST(atomic_inc_and_test_32, test_atomic_arithmetic, &inc_and_test_32_arg),
+	UNIT_TEST(atomic_inc_and_test_64, test_atomic_arithmetic, &inc_and_test_64_arg),
+	UNIT_TEST(atomic_inc_64, test_atomic_arithmetic, &inc_64_arg),
+	UNIT_TEST(atomic_dec_32, test_atomic_arithmetic, &dec_32_arg),
+	UNIT_TEST(atomic_dec_64, test_atomic_arithmetic, &dec_64_arg),
+	UNIT_TEST(atomic_dec_and_test_32, test_atomic_arithmetic, &dec_and_test_32_arg),
+	UNIT_TEST(atomic_dec_and_test_64, test_atomic_arithmetic, &dec_and_test_64_arg),
+	UNIT_TEST(atomic_add_32, test_atomic_arithmetic, &add_32_arg),
+	UNIT_TEST(atomic_add_64, test_atomic_arithmetic, &add_64_arg),
+	UNIT_TEST(atomic_sub_32, test_atomic_arithmetic, &sub_32_arg),
+	UNIT_TEST(atomic_sub_64, test_atomic_arithmetic, &sub_64_arg),
+	UNIT_TEST(atomic_sub_and_test_32, test_atomic_arithmetic, &sub_and_test_32_arg),
+	UNIT_TEST(atomic_sub_and_test_64, test_atomic_arithmetic, &sub_and_test_64_arg),
+	UNIT_TEST(atomic_xchg_32, test_atomic_xchg, &xchg_32_arg),
+	UNIT_TEST(atomic_xchg_64, test_atomic_xchg, &xchg_64_arg),
+	UNIT_TEST(atomic_cmpxchg_32, test_atomic_cmpxchg, &xchg_32_arg),
+	UNIT_TEST(atomic_cmpxchg_64, test_atomic_cmpxchg, &xchg_64_arg),
+	UNIT_TEST(atomic_add_unless_32, test_atomic_add_unless, &add_unless_32_arg),
+	UNIT_TEST(atomic_add_unless_64, test_atomic_add_unless, &add_unless_64_arg),
+	UNIT_TEST(atomic_inc_32_threaded, test_atomic_arithmetic_threaded, &inc_32_arg),
+	UNIT_TEST(atomic_inc_64_threaded, test_atomic_arithmetic_threaded, &inc_64_arg),
+	UNIT_TEST(atomic_dec_32_threaded, test_atomic_arithmetic_threaded, &dec_32_arg),
+	UNIT_TEST(atomic_dec_64_threaded, test_atomic_arithmetic_threaded, &dec_64_arg),
+	UNIT_TEST(atomic_add_32_threaded, test_atomic_arithmetic_threaded, &add_32_arg),
+	UNIT_TEST(atomic_add_64_threaded, test_atomic_arithmetic_threaded, &add_64_arg),
+	UNIT_TEST(atomic_sub_32_threaded, test_atomic_arithmetic_threaded, &sub_32_arg),
+	UNIT_TEST(atomic_sub_64_threaded, test_atomic_arithmetic_threaded, &sub_64_arg),
+	UNIT_TEST(atomic_inc_and_test_32_threaded, test_atomic_arithmetic_threaded, &inc_and_test_32_arg),
+	UNIT_TEST(atomic_inc_and_test_64_threaded, test_atomic_arithmetic_threaded, &inc_and_test_64_arg),
+	UNIT_TEST(atomic_dec_and_test_32_threaded, test_atomic_arithmetic_threaded, &dec_and_test_32_arg),
+	UNIT_TEST(atomic_dec_and_test_64_threaded, test_atomic_arithmetic_threaded, &dec_and_test_64_arg),
+	UNIT_TEST(atomic_sub_and_test_32_threaded, test_atomic_arithmetic_threaded, &sub_and_test_32_arg),
+	UNIT_TEST(atomic_sub_and_test_64_threaded, test_atomic_arithmetic_threaded, &sub_and_test_64_arg),
+	UNIT_TEST(atomic_add_unless_32_threaded, test_atomic_arithmetic_threaded, &add_unless_32_arg),
+	UNIT_TEST(atomic_add_unless_64_threaded, test_atomic_arithmetic_threaded, &add_unless_64_arg),
+};
+
+UNIT_MODULE(atomic, atomic_tests, UNIT_PRIO_POSIX_TEST);