1
linux/arch/blackfin/mach-bf561/atomic.S

927 lines
15 KiB
ArmAsm
Raw Normal View History

/*
* Copyright 2007-2008 Analog Devices Inc.
* Philippe Gerum <rpm@xenomai.org>
*
* Licensed under the GPL-2 or later.
*/
#include <linux/linkage.h>
#include <asm/blackfin.h>
#include <asm/cache.h>
#include <asm/asm-offsets.h>
#include <asm/rwlock.h>
#include <asm/cplb.h>
.text
.macro coreslot_loadaddr reg:req
\reg\().l = _corelock;
\reg\().h = _corelock;
.endm
.macro safe_testset addr:req, scratch:req
#if ANOMALY_05000477
cli \scratch;
testset (\addr);
sti \scratch;
#else
testset (\addr);
#endif
.endm
/*
* r0 = address of atomic data to flush and invalidate (32bit).
*
* Clear interrupts and return the old mask.
* We assume that no atomic data can span cachelines.
*
* Clobbers: r2:0, p0
*/
ENTRY(_get_core_lock)
r1 = -L1_CACHE_BYTES;
r1 = r0 & r1;
cli r0;
coreslot_loadaddr p0;
.Lretry_corelock:
safe_testset p0, r2;
if cc jump .Ldone_corelock;
SSYNC(r2);
jump .Lretry_corelock
.Ldone_corelock:
p0 = r1;
/* flush core internal write buffer before invalidate dcache */
CSYNC(r2);
flushinv[p0];
SSYNC(r2);
rts;
ENDPROC(_get_core_lock)
/*
* r0 = address of atomic data in uncacheable memory region (32bit).
*
* Clear interrupts and return the old mask.
*
* Clobbers: r0, p0
*/
ENTRY(_get_core_lock_noflush)
cli r0;
coreslot_loadaddr p0;
.Lretry_corelock_noflush:
safe_testset p0, r2;
if cc jump .Ldone_corelock_noflush;
SSYNC(r2);
jump .Lretry_corelock_noflush
.Ldone_corelock_noflush:
/*
* SMP kgdb runs into dead loop without NOP here, when one core
* single steps over get_core_lock_noflush and the other executes
* get_core_lock as a slave node.
*/
nop;
CSYNC(r2);
rts;
ENDPROC(_get_core_lock_noflush)
/*
* r0 = interrupt mask to restore.
* r1 = address of atomic data to flush and invalidate (32bit).
*
* Interrupts are masked on entry (see _get_core_lock).
* Clobbers: r2:0, p0
*/
ENTRY(_put_core_lock)
/* Write-through cache assumed, so no flush needed here. */
coreslot_loadaddr p0;
r1 = 0;
[p0] = r1;
SSYNC(r2);
sti r0;
rts;
ENDPROC(_put_core_lock)
#ifdef __ARCH_SYNC_CORE_DCACHE
ENTRY(___raw_smp_mark_barrier_asm)
[--sp] = rets;
[--sp] = ( r7:5 );
[--sp] = r0;
[--sp] = p1;
[--sp] = p0;
call _get_core_lock_noflush;
/*
* Calculate current core mask
*/
GET_CPUID(p1, r7);
r6 = 1;
r6 <<= r7;
/*
* Set bit of other cores in barrier mask. Don't change current core bit.
*/
p1.l = _barrier_mask;
p1.h = _barrier_mask;
r7 = [p1];
r5 = r7 & r6;
r7 = ~r6;
cc = r5 == 0;
if cc jump 1f;
r7 = r7 | r6;
1:
[p1] = r7;
SSYNC(r2);
call _put_core_lock;
p0 = [sp++];
p1 = [sp++];
r0 = [sp++];
( r7:5 ) = [sp++];
rets = [sp++];
rts;
ENDPROC(___raw_smp_mark_barrier_asm)
ENTRY(___raw_smp_check_barrier_asm)
[--sp] = rets;
[--sp] = ( r7:5 );
[--sp] = r0;
[--sp] = p1;
[--sp] = p0;
call _get_core_lock_noflush;
/*
* Calculate current core mask
*/
GET_CPUID(p1, r7);
r6 = 1;
r6 <<= r7;
/*
* Clear current core bit in barrier mask if it is set.
*/
p1.l = _barrier_mask;
p1.h = _barrier_mask;
r7 = [p1];
r5 = r7 & r6;
cc = r5 == 0;
if cc jump 1f;
r6 = ~r6;
r7 = r7 & r6;
[p1] = r7;
SSYNC(r2);
call _put_core_lock;
/*
* Invalidate the entire D-cache of current core.
*/
sp += -12;
call _resync_core_dcache
sp += 12;
jump 2f;
1:
call _put_core_lock;
2:
p0 = [sp++];
p1 = [sp++];
r0 = [sp++];
( r7:5 ) = [sp++];
rets = [sp++];
rts;
ENDPROC(___raw_smp_check_barrier_asm)
/*
* r0 = irqflags
* r1 = address of atomic data
*
* Clobbers: r2:0, p1:0
*/
_start_lock_coherent:
[--sp] = rets;
[--sp] = ( r7:6 );
r7 = r0;
p1 = r1;
/*
* Determine whether the atomic data was previously
* owned by another CPU (=r6).
*/
GET_CPUID(p0, r2);
r1 = 1;
r1 <<= r2;
r2 = ~r1;
r1 = [p1];
r1 >>= 28; /* CPU fingerprints are stored in the high nibble. */
r6 = r1 & r2;
r1 = [p1];
r1 <<= 4;
r1 >>= 4;
[p1] = r1;
/*
* Release the core lock now, but keep IRQs disabled while we are
* performing the remaining housekeeping chores for the current CPU.
*/
coreslot_loadaddr p0;
r1 = 0;
[p0] = r1;
/*
* If another CPU has owned the same atomic section before us,
* then our D-cached copy of the shared data protected by the
* current spin/write_lock may be obsolete.
*/
cc = r6 == 0;
if cc jump .Lcache_synced
/*
* Invalidate the entire D-cache of the current core.
*/
sp += -12;
call _resync_core_dcache
sp += 12;
.Lcache_synced:
SSYNC(r2);
sti r7;
( r7:6 ) = [sp++];
rets = [sp++];
rts
/*
* r0 = irqflags
* r1 = address of atomic data
*
* Clobbers: r2:0, p1:0
*/
_end_lock_coherent:
p1 = r1;
GET_CPUID(p0, r2);
r2 += 28;
r1 = 1;
r1 <<= r2;
r2 = [p1];
r2 = r1 | r2;
[p1] = r2;
r1 = p1;
jump _put_core_lock;
#endif /* __ARCH_SYNC_CORE_DCACHE */
/*
* r0 = &spinlock->lock
*
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_spin_is_locked_asm)
p1 = r0;
[--sp] = rets;
call _get_core_lock;
r3 = [p1];
cc = bittst( r3, 0 );
r3 = cc;
r1 = p1;
call _put_core_lock;
rets = [sp++];
r0 = r3;
rts;
ENDPROC(___raw_spin_is_locked_asm)
/*
* r0 = &spinlock->lock
*
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_spin_lock_asm)
p1 = r0;
[--sp] = rets;
.Lretry_spinlock:
call _get_core_lock;
r1 = p1;
r2 = [p1];
cc = bittst( r2, 0 );
if cc jump .Lbusy_spinlock
#ifdef __ARCH_SYNC_CORE_DCACHE
r3 = p1;
bitset ( r2, 0 ); /* Raise the lock bit. */
[p1] = r2;
call _start_lock_coherent
#else
r2 = 1;
[p1] = r2;
call _put_core_lock;
#endif
rets = [sp++];
rts;
.Lbusy_spinlock:
/* We don't touch the atomic area if busy, so that flush
will behave like nop in _put_core_lock. */
call _put_core_lock;
SSYNC(r2);
r0 = p1;
jump .Lretry_spinlock
ENDPROC(___raw_spin_lock_asm)
/*
* r0 = &spinlock->lock
*
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_spin_trylock_asm)
p1 = r0;
[--sp] = rets;
call _get_core_lock;
r1 = p1;
r3 = [p1];
cc = bittst( r3, 0 );
if cc jump .Lfailed_trylock
#ifdef __ARCH_SYNC_CORE_DCACHE
bitset ( r3, 0 ); /* Raise the lock bit. */
[p1] = r3;
call _start_lock_coherent
#else
r2 = 1;
[p1] = r2;
call _put_core_lock;
#endif
r0 = 1;
rets = [sp++];
rts;
.Lfailed_trylock:
call _put_core_lock;
r0 = 0;
rets = [sp++];
rts;
ENDPROC(___raw_spin_trylock_asm)
/*
* r0 = &spinlock->lock
*
* Clobbers: r2:0, p1:0
*/
ENTRY(___raw_spin_unlock_asm)
p1 = r0;
[--sp] = rets;
call _get_core_lock;
r2 = [p1];
bitclr ( r2, 0 );
[p1] = r2;
r1 = p1;
#ifdef __ARCH_SYNC_CORE_DCACHE
call _end_lock_coherent
#else
call _put_core_lock;
#endif
rets = [sp++];
rts;
ENDPROC(___raw_spin_unlock_asm)
/*
* r0 = &rwlock->lock
*
* Clobbers: r2:0, p1:0
*/
ENTRY(___raw_read_lock_asm)
p1 = r0;
[--sp] = rets;
call _get_core_lock;
.Lrdlock_try:
r1 = [p1];
r1 += -1;
[p1] = r1;
cc = r1 < 0;
if cc jump .Lrdlock_failed
r1 = p1;
#ifdef __ARCH_SYNC_CORE_DCACHE
call _start_lock_coherent
#else
call _put_core_lock;
#endif
rets = [sp++];
rts;
.Lrdlock_failed:
r1 += 1;
[p1] = r1;
.Lrdlock_wait:
r1 = p1;
call _put_core_lock;
SSYNC(r2);
r0 = p1;
call _get_core_lock;
r1 = [p1];
cc = r1 < 2;
if cc jump .Lrdlock_wait;
jump .Lrdlock_try
ENDPROC(___raw_read_lock_asm)
/*
* r0 = &rwlock->lock
*
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_read_trylock_asm)
p1 = r0;
[--sp] = rets;
call _get_core_lock;
r1 = [p1];
cc = r1 <= 0;
if cc jump .Lfailed_tryrdlock;
r1 += -1;
[p1] = r1;
r1 = p1;
#ifdef __ARCH_SYNC_CORE_DCACHE
call _start_lock_coherent
#else
call _put_core_lock;
#endif
rets = [sp++];
r0 = 1;
rts;
.Lfailed_tryrdlock:
r1 = p1;
call _put_core_lock;
rets = [sp++];
r0 = 0;
rts;
ENDPROC(___raw_read_trylock_asm)
/*
* r0 = &rwlock->lock
*
* Note: Processing controlled by a reader lock should not have
* any side-effect on cache issues with the other core, so we
* just release the core lock and exit (no _end_lock_coherent).
*
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_read_unlock_asm)
p1 = r0;
[--sp] = rets;
call _get_core_lock;
r1 = [p1];
r1 += 1;
[p1] = r1;
r1 = p1;
call _put_core_lock;
rets = [sp++];
rts;
ENDPROC(___raw_read_unlock_asm)
/*
* r0 = &rwlock->lock
*
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_write_lock_asm)
p1 = r0;
r3.l = lo(RW_LOCK_BIAS);
r3.h = hi(RW_LOCK_BIAS);
[--sp] = rets;
call _get_core_lock;
.Lwrlock_try:
r1 = [p1];
r1 = r1 - r3;
#ifdef __ARCH_SYNC_CORE_DCACHE
r2 = r1;
r2 <<= 4;
r2 >>= 4;
cc = r2 == 0;
#else
cc = r1 == 0;
#endif
if !cc jump .Lwrlock_wait
[p1] = r1;
r1 = p1;
#ifdef __ARCH_SYNC_CORE_DCACHE
call _start_lock_coherent
#else
call _put_core_lock;
#endif
rets = [sp++];
rts;
.Lwrlock_wait:
r1 = p1;
call _put_core_lock;
SSYNC(r2);
r0 = p1;
call _get_core_lock;
r1 = [p1];
#ifdef __ARCH_SYNC_CORE_DCACHE
r1 <<= 4;
r1 >>= 4;
#endif
cc = r1 == r3;
if !cc jump .Lwrlock_wait;
jump .Lwrlock_try
ENDPROC(___raw_write_lock_asm)
/*
* r0 = &rwlock->lock
*
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_write_trylock_asm)
p1 = r0;
[--sp] = rets;
call _get_core_lock;
r1 = [p1];
r2.l = lo(RW_LOCK_BIAS);
r2.h = hi(RW_LOCK_BIAS);
cc = r1 == r2;
if !cc jump .Lfailed_trywrlock;
#ifdef __ARCH_SYNC_CORE_DCACHE
r1 >>= 28;
r1 <<= 28;
#else
r1 = 0;
#endif
[p1] = r1;
r1 = p1;
#ifdef __ARCH_SYNC_CORE_DCACHE
call _start_lock_coherent
#else
call _put_core_lock;
#endif
rets = [sp++];
r0 = 1;
rts;
.Lfailed_trywrlock:
r1 = p1;
call _put_core_lock;
rets = [sp++];
r0 = 0;
rts;
ENDPROC(___raw_write_trylock_asm)
/*
* r0 = &rwlock->lock
*
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_write_unlock_asm)
p1 = r0;
r3.l = lo(RW_LOCK_BIAS);
r3.h = hi(RW_LOCK_BIAS);
[--sp] = rets;
call _get_core_lock;
r1 = [p1];
r1 = r1 + r3;
[p1] = r1;
r1 = p1;
#ifdef __ARCH_SYNC_CORE_DCACHE
call _end_lock_coherent
#else
call _put_core_lock;
#endif
rets = [sp++];
rts;
ENDPROC(___raw_write_unlock_asm)
/*
* r0 = ptr
* r1 = value
*
* Add a signed value to a 32bit word and return the new value atomically.
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_atomic_update_asm)
p1 = r0;
r3 = r1;
[--sp] = rets;
call _get_core_lock;
r2 = [p1];
r3 = r3 + r2;
[p1] = r3;
r1 = p1;
call _put_core_lock;
r0 = r3;
rets = [sp++];
rts;
ENDPROC(___raw_atomic_update_asm)
/*
* r0 = ptr
* r1 = mask
*
* Clear the mask bits from a 32bit word and return the old 32bit value
* atomically.
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_atomic_clear_asm)
p1 = r0;
r3 = ~r1;
[--sp] = rets;
call _get_core_lock;
r2 = [p1];
r3 = r2 & r3;
[p1] = r3;
r3 = r2;
r1 = p1;
call _put_core_lock;
r0 = r3;
rets = [sp++];
rts;
ENDPROC(___raw_atomic_clear_asm)
/*
* r0 = ptr
* r1 = mask
*
* Set the mask bits into a 32bit word and return the old 32bit value
* atomically.
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_atomic_set_asm)
p1 = r0;
r3 = r1;
[--sp] = rets;
call _get_core_lock;
r2 = [p1];
r3 = r2 | r3;
[p1] = r3;
r3 = r2;
r1 = p1;
call _put_core_lock;
r0 = r3;
rets = [sp++];
rts;
ENDPROC(___raw_atomic_set_asm)
/*
* r0 = ptr
* r1 = mask
*
* XOR the mask bits with a 32bit word and return the old 32bit value
* atomically.
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_atomic_xor_asm)
p1 = r0;
r3 = r1;
[--sp] = rets;
call _get_core_lock;
r2 = [p1];
r3 = r2 ^ r3;
[p1] = r3;
r3 = r2;
r1 = p1;
call _put_core_lock;
r0 = r3;
rets = [sp++];
rts;
ENDPROC(___raw_atomic_xor_asm)
/*
* r0 = ptr
* r1 = mask
*
* Perform a logical AND between the mask bits and a 32bit word, and
* return the masked value. We need this on this architecture in
* order to invalidate the local cache before testing.
*
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_atomic_test_asm)
p1 = r0;
r3 = r1;
r1 = -L1_CACHE_BYTES;
r1 = r0 & r1;
p0 = r1;
/* flush core internal write buffer before invalidate dcache */
CSYNC(r2);
flushinv[p0];
SSYNC(r2);
r0 = [p1];
r0 = r0 & r3;
rts;
ENDPROC(___raw_atomic_test_asm)
/*
* r0 = ptr
* r1 = value
*
* Swap *ptr with value and return the old 32bit value atomically.
* Clobbers: r3:0, p1:0
*/
#define __do_xchg(src, dst) \
p1 = r0; \
r3 = r1; \
[--sp] = rets; \
call _get_core_lock; \
r2 = src; \
dst = r3; \
r3 = r2; \
r1 = p1; \
call _put_core_lock; \
r0 = r3; \
rets = [sp++]; \
rts;
ENTRY(___raw_xchg_1_asm)
__do_xchg(b[p1] (z), b[p1])
ENDPROC(___raw_xchg_1_asm)
ENTRY(___raw_xchg_2_asm)
__do_xchg(w[p1] (z), w[p1])
ENDPROC(___raw_xchg_2_asm)
ENTRY(___raw_xchg_4_asm)
__do_xchg([p1], [p1])
ENDPROC(___raw_xchg_4_asm)
/*
* r0 = ptr
* r1 = new
* r2 = old
*
* Swap *ptr with new if *ptr == old and return the previous *ptr
* value atomically.
*
* Clobbers: r3:0, p1:0
*/
#define __do_cmpxchg(src, dst) \
[--sp] = rets; \
[--sp] = r4; \
p1 = r0; \
r3 = r1; \
r4 = r2; \
call _get_core_lock; \
r2 = src; \
cc = r2 == r4; \
if !cc jump 1f; \
dst = r3; \
1: r3 = r2; \
r1 = p1; \
call _put_core_lock; \
r0 = r3; \
r4 = [sp++]; \
rets = [sp++]; \
rts;
ENTRY(___raw_cmpxchg_1_asm)
__do_cmpxchg(b[p1] (z), b[p1])
ENDPROC(___raw_cmpxchg_1_asm)
ENTRY(___raw_cmpxchg_2_asm)
__do_cmpxchg(w[p1] (z), w[p1])
ENDPROC(___raw_cmpxchg_2_asm)
ENTRY(___raw_cmpxchg_4_asm)
__do_cmpxchg([p1], [p1])
ENDPROC(___raw_cmpxchg_4_asm)
/*
* r0 = ptr
* r1 = bitnr
*
* Set a bit in a 32bit word and return the old 32bit value atomically.
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_bit_set_asm)
r2 = r1;
r1 = 1;
r1 <<= r2;
jump ___raw_atomic_set_asm
ENDPROC(___raw_bit_set_asm)
/*
* r0 = ptr
* r1 = bitnr
*
* Clear a bit in a 32bit word and return the old 32bit value atomically.
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_bit_clear_asm)
r2 = r1;
r1 = 1;
r1 <<= r2;
jump ___raw_atomic_clear_asm
ENDPROC(___raw_bit_clear_asm)
/*
* r0 = ptr
* r1 = bitnr
*
* Toggle a bit in a 32bit word and return the old 32bit value atomically.
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_bit_toggle_asm)
r2 = r1;
r1 = 1;
r1 <<= r2;
jump ___raw_atomic_xor_asm
ENDPROC(___raw_bit_toggle_asm)
/*
* r0 = ptr
* r1 = bitnr
*
* Test-and-set a bit in a 32bit word and return the old bit value atomically.
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_bit_test_set_asm)
[--sp] = rets;
[--sp] = r1;
call ___raw_bit_set_asm
r1 = [sp++];
r2 = 1;
r2 <<= r1;
r0 = r0 & r2;
cc = r0 == 0;
if cc jump 1f
r0 = 1;
1:
rets = [sp++];
rts;
ENDPROC(___raw_bit_test_set_asm)
/*
* r0 = ptr
* r1 = bitnr
*
* Test-and-clear a bit in a 32bit word and return the old bit value atomically.
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_bit_test_clear_asm)
[--sp] = rets;
[--sp] = r1;
call ___raw_bit_clear_asm
r1 = [sp++];
r2 = 1;
r2 <<= r1;
r0 = r0 & r2;
cc = r0 == 0;
if cc jump 1f
r0 = 1;
1:
rets = [sp++];
rts;
ENDPROC(___raw_bit_test_clear_asm)
/*
* r0 = ptr
* r1 = bitnr
*
* Test-and-toggle a bit in a 32bit word,
* and return the old bit value atomically.
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_bit_test_toggle_asm)
[--sp] = rets;
[--sp] = r1;
call ___raw_bit_toggle_asm
r1 = [sp++];
r2 = 1;
r2 <<= r1;
r0 = r0 & r2;
cc = r0 == 0;
if cc jump 1f
r0 = 1;
1:
rets = [sp++];
rts;
ENDPROC(___raw_bit_test_toggle_asm)
/*
* r0 = ptr
* r1 = bitnr
*
* Test a bit in a 32bit word and return its value.
* We need this on this architecture in order to invalidate
* the local cache before testing.
*
* Clobbers: r3:0, p1:0
*/
ENTRY(___raw_bit_test_asm)
r2 = r1;
r1 = 1;
r1 <<= r2;
jump ___raw_atomic_test_asm
ENDPROC(___raw_bit_test_asm)
/*
* r0 = ptr
*
* Fetch and return an uncached 32bit value.
*
* Clobbers: r2:0, p1:0
*/
ENTRY(___raw_uncached_fetch_asm)
p1 = r0;
r1 = -L1_CACHE_BYTES;
r1 = r0 & r1;
p0 = r1;
/* flush core internal write buffer before invalidate dcache */
CSYNC(r2);
flushinv[p0];
SSYNC(r2);
r0 = [p1];
rts;
ENDPROC(___raw_uncached_fetch_asm)