Add atomics for Watcom/x86 as inline asm

Partially fixes Bugzilla #3758.
diff --git a/include/SDL_atomic.h b/include/SDL_atomic.h
index 6f68ebb..b7c9de2 100644
--- a/include/SDL_atomic.h
+++ b/include/SDL_atomic.h
@@ -125,6 +125,9 @@
 #elif (defined(__GNUC__) && !defined(__EMSCRIPTEN__)) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x5120))
 /* This is correct for all CPUs when using GCC or Solaris Studio 12.1+. */
 #define SDL_CompilerBarrier()   __asm__ __volatile__ ("" : : : "memory")
+#elif defined(__WATCOMC__)
+extern _inline void SDL_CompilerBarrier (void);
+#pragma aux SDL_CompilerBarrier = "" parm [] modify exact [];
 #else
 #define SDL_CompilerBarrier()   \
 { SDL_SpinLock _tmp = 0; SDL_AtomicLock(&_tmp); SDL_AtomicUnlock(&_tmp); }
diff --git a/src/atomic/SDL_atomic.c b/src/atomic/SDL_atomic.c
index 28c25ae..67fcb99 100644
--- a/src/atomic/SDL_atomic.c
+++ b/src/atomic/SDL_atomic.c
@@ -52,6 +52,31 @@
 # endif
 #endif
 
+#if defined(__WATCOMC__) && defined(__386__)
+#define HAVE_WATCOM_ATOMICS
+extern _inline int _SDL_xchg_watcom(volatile int *a, int v);
+#pragma aux _SDL_xchg_watcom = \
+  "xchg [ecx], eax" \
+  parm [ecx] [eax] \
+  value [eax] \
+  modify exact [eax];
+
+extern _inline unsigned char _SDL_cmpxchg_watcom(volatile int *a, int newval, int oldval);
+#pragma aux _SDL_cmpxchg_watcom = \
+  "lock cmpxchg [edx], ecx" \
+  "setz al" \
+  parm [edx] [ecx] [eax] \
+  value [al] \
+  modify exact [eax];
+
+extern _inline int _SDL_xadd_watcom(volatile int *a, int v);
+#pragma aux _SDL_xadd_watcom = \
+  "lock xadd [ecx], eax" \
+  parm [ecx] [eax] \
+  value [eax] \
+  modify exact [eax];
+#endif /* __WATCOMC__ && __386__ */
+
 /*
   If any of the operations are not provided then we must emulate some
   of them. That means we need a nice implementation of spin locks
@@ -75,7 +100,7 @@
   Contributed by Bob Pendleton, bob@pendleton.com
 */
 
-#if !defined(HAVE_MSC_ATOMICS) && !defined(HAVE_GCC_ATOMICS) && !defined(__MACOSX__) && !defined(__SOLARIS__)
+#if !defined(HAVE_MSC_ATOMICS) && !defined(HAVE_GCC_ATOMICS) && !defined(__MACOSX__) && !defined(__SOLARIS__) && !defined(HAVE_WATCOM_ATOMICS)
 #define EMULATE_CAS 1
 #endif
 
@@ -105,6 +130,8 @@
 {
 #ifdef HAVE_MSC_ATOMICS
     return (_InterlockedCompareExchange((long*)&a->value, (long)newval, (long)oldval) == (long)oldval);
+#elif defined(HAVE_WATCOM_ATOMICS)
+    return (SDL_bool) _SDL_cmpxchg_watcom(&a->value, newval, oldval);
 #elif defined(HAVE_GCC_ATOMICS)
     return (SDL_bool) __sync_bool_compare_and_swap(&a->value, oldval, newval);
 #elif defined(__MACOSX__)  /* this is deprecated in 10.12 sdk; favor gcc atomics. */
@@ -136,6 +163,8 @@
     return (_InterlockedCompareExchange((long*)a, (long)newval, (long)oldval) == (long)oldval);
 #elif defined(HAVE_MSC_ATOMICS) && (!_M_IX86)
     return (_InterlockedCompareExchangePointer(a, newval, oldval) == oldval);
+#elif defined(HAVE_WATCOM_ATOMICS)
+    return (SDL_bool) _SDL_cmpxchg_watcom((int *)a, (long)newval, (long)oldval);
 #elif defined(HAVE_GCC_ATOMICS)
     return __sync_bool_compare_and_swap(a, oldval, newval);
 #elif defined(__MACOSX__) && defined(__LP64__)  /* this is deprecated in 10.12 sdk; favor gcc atomics. */
@@ -165,6 +194,8 @@
 {
 #ifdef HAVE_MSC_ATOMICS
     return _InterlockedExchange((long*)&a->value, v);
+#elif defined(HAVE_WATCOM_ATOMICS)
+    return _SDL_xchg_watcom(&a->value, v);
 #elif defined(HAVE_GCC_ATOMICS)
     return __sync_lock_test_and_set(&a->value, v);
 #elif defined(__SOLARIS__) && defined(_LP64)
@@ -187,6 +218,8 @@
     return (void *) _InterlockedExchange((long *)a, (long) v);
 #elif defined(HAVE_MSC_ATOMICS) && (!_M_IX86)
     return _InterlockedExchangePointer(a, v);
+#elif defined(HAVE_WATCOM_ATOMICS)
+    return (void *) _SDL_xchg_watcom((int *)a, (long)v);
 #elif defined(HAVE_GCC_ATOMICS)
     return __sync_lock_test_and_set(a, v);
 #elif defined(__SOLARIS__)
@@ -205,6 +238,8 @@
 {
 #ifdef HAVE_MSC_ATOMICS
     return _InterlockedExchangeAdd((long*)&a->value, v);
+#elif defined(HAVE_WATCOM_ATOMICS)
+    return _SDL_xadd_watcom(&a->value, v);
 #elif defined(HAVE_GCC_ATOMICS)
     return __sync_fetch_and_add(&a->value, v);
 #elif defined(__SOLARIS__)
diff --git a/src/atomic/SDL_spinlock.c b/src/atomic/SDL_spinlock.c
index 2f46228..aa68051 100644
--- a/src/atomic/SDL_spinlock.c
+++ b/src/atomic/SDL_spinlock.c
@@ -32,6 +32,16 @@
 #include <atomic.h>
 #endif
 
+#if defined(__WATCOMC__) && defined(__386__)
+SDL_COMPILE_TIME_ASSERT(locksize, 4==sizeof(SDL_SpinLock));
+extern _inline int _SDL_xchg_watcom(volatile int *a, int v);
+#pragma aux _SDL_xchg_watcom = \
+  "xchg [ecx], eax" \
+  parm [ecx] [eax] \
+  value [eax] \
+  modify exact [eax];
+#endif /* __WATCOMC__ && __386__ */
+
 /* This function is where all the magic happens... */
 SDL_bool
 SDL_AtomicTryLock(SDL_SpinLock *lock)
@@ -58,6 +68,9 @@
     SDL_COMPILE_TIME_ASSERT(locksize, sizeof(*lock) == sizeof(long));
     return (InterlockedExchange((long*)lock, 1) == 0);
 
+#elif defined(__WATCOMC__) && defined(__386__)
+    return _SDL_xchg_watcom(lock, 1) == 0;
+
 #elif HAVE_GCC_ATOMICS || HAVE_GCC_SYNC_LOCK_TEST_AND_SET
     return (__sync_lock_test_and_set(lock, 1) == 0);
 
@@ -119,6 +132,10 @@
     _ReadWriteBarrier();
     *lock = 0;
 
+#elif defined(__WATCOMC__) && defined(__386__)
+    SDL_CompilerBarrier ();
+    *lock = 0;
+
 #elif HAVE_GCC_ATOMICS || HAVE_GCC_SYNC_LOCK_TEST_AND_SET
     __sync_lock_release(lock);