newlib/libc/machine/sh/memcpy.S


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223

!
! Fast SH memcpy
!
! by Toshiyasu Morita (tm@netcom.com)
! hacked by J"orn Rernnecke (amylaar@cygnus.co.uk) ("o for o-umlaut)
!
! Entry: r4: destination pointer
!        r5: source pointer
!        r6: byte count
!
! Exit:  r0: destination pointer
!        r1-r7: trashed
!
! Notes: Usually one wants to do small reads and write a longword, but
!        unfortunately it is difficult in some cases to concatenate bytes
!        into a longword on the SH, so this does a longword read and small
!        writes.
!
! This implementation makes two assumptions about how it is called:
!
! 1.: If the byte count is nonzero, the address of the last byte to be
!     copied is unsigned greater than the address of the first byte to
!     be copied.  This could be easily swapped for a signed comparison,
!     but the algorithm used needs some comparison.
!
! 2.: When there are two or three bytes in the last word of an 11-or-more
!     bytes memory chunk to be copied, the rest of the word can be read
!     without side effects.
!     This could be easily changed by increasing the minimum size of
!     a fast memcpy and the amount subtracted from r7 before L_2l_loop by 2,
!     however, this would cost a few extra cycles on average.
!

#include "asm.h"

! memcpy entry point.  On entry r4 = destination, r5 = source, r6 = byte
! count.  One of two implementations is assembled, selected by endianness.
ENTRY(memcpy)
#ifdef __LITTLE_ENDIAN__
	! Little endian version copies with increasing addresses.
	!
	! Register roles in this variant, as used by the code below:
	!   r3  copy of the destination pointer, moved to r0 on return
	!   r4  destination pointer on the byte-wise paths; data scratch in
	!       L_al4both_loop
	!   r5  source pointer, advanced by the post-increment loads
	!   r6  source end address (source pointer + byte count)
	!   r7  loop bound derived from the source end address
	!   r0  scratch; in the longword loops and in the cleanup loop it holds
	!       an offset such that r0 + r5 is the address of the current store
	!
	! NOTE(review): the SL macro is defined in asm.h, which is not shown
	! here.  From its use it appears to emit the conditional branch named by
	! its first two arguments and to execute the third argument on both the
	! taken and the fall-through path -- confirm against asm.h.
	mov r4,r3	! Save return value
	mov #11,r0	! Check if small number of bytes
	cmp/hs r0,r6
			! r6 becomes src end address
	SL(bf, L_small, add r5,r6)
	mov #1,r1
	tst r1,r5	! check if source even
	! r7 starts out as a copy of the source end address.
	SL(bt, L_even, mov r6,r7)
	mov.b @r5+,r0	! no, make it even.
	mov.b r0,@r4
	add #1,r4
L_even:	tst r1,r4	! check if destination is even
	add #-3,r7	! r7 := src end address minus 3 (loop bound)
	! r1 := 2 for the 4-byte alignment tests that follow; an odd
	! destination is handled at L_odddst.
	SL(bf, L_odddst, mov #2,r1)
	tst r1,r4	! check if destination is 4-byte aligned
	mov r4,r0
	! r0 := destination minus source; used below as the index for stores.
	SL(bt, L_al4dst, sub r5,r0)
	! Destination is even but not 4-byte aligned: copy one 16-bit word.
	mov.w @r5+,r2
	mov.w r2,@r4
	! add #2,r4  r4 is dead here.
L_al4dst:
	tst r1,r5	! check if source is 4-byte aligned as well
	bt L_al4both
	! Source is 2 bytes off 4-byte alignment: preload one 16-bit word into
	! the upper half of r1, then let xtrct splice it with each longword read.
	mov.w @r5+,r1
	swap.w r1,r1
	! Bias r0 so that r0 + r5, taken after the load in the loop has
	! incremented r5, is the address to store to.
	add #-6,r0
	add #-6,r7	! r7 := src end address minus 9.
	.align 2
L_2l_loop:
	mov.l @r5+,r2 ! Read & write two longwords per iteration
	xtrct r2,r1	! r1 := low half of r2 : high half of r1
	mov.l r1,@(r0,r5)
	cmp/hs r7,r5	! T := r5 >= r7 (unsigned); tested by the bf below
	mov.l @r5+,r1
	xtrct r1,r2	! r2 := low half of r1 : high half of r2
	mov.l r2,@(r0,r5)
	bf L_2l_loop
	! The upper half of r1 has been read but not stored: step the source
	! pointer back by 2 so that the cleanup loop copies those bytes.
	add #-2,r5
	bra  L_cleanup
	add #5,r0	! (delay slot) r0 := destination minus source minus 1
L_al4both:
	add #-4,r0	! bias the index for the post-increment in the loop
	.align 2
L_al4both_loop:
	mov.l @r5+,r4   ! Read longword, write longword per iteration
	cmp/hs r7,r5
	! The store is the third SL argument; loop while r5 is below r7.
	SL(bf, L_al4both_loop, mov.l r4,@(r0,r5))

	bra L_cleanup
	add #3,r0	! (delay slot) r0 := destination minus source minus 1

L_odddst:
	! Destination is odd; the source is even here and r1 is 2.
	tst r1,r5	! check if source is 4-byte aligned
	! r4 is biased by -1, so that @(1,r4) is the next destination byte.
	SL(bt, L_al4src, add #-1,r4)
	! Source is not 4-byte aligned: copy one 16-bit word as two bytes.
	mov.w @r5+,r0
	mov.b r0,@(1,r4)
	shlr8 r0
	mov.b r0,@(2,r4)
	add #2,r4
L_al4src:
	.align 2
L_odd_loop:
	mov.l @r5+,r0   ! Read longword, write byte, word, byte per iteration
	cmp/hs r7,r5
	mov.b r0,@(1,r4)
	shlr8 r0
	mov.w r0,@(2,r4)
	shlr16 r0
	mov.b r0,@(4,r4)
	SL(bf, L_odd_loop, add #4,r4)
	.align 2 ! avoid nop in more frequently executed code.
L_cleanup2:
	! Entered with r4 = address of the next destination byte minus 1.
	mov	r4,r0
	sub	r5,r0	! r0 := r4 - r5, the index used by L_cleanup_loop
L_cleanup:
	! Copy the remaining bytes one at a time until r5 reaches the source
	! end address in r6.
	cmp/eq	r6,r5
	bt	L_ready
	.align 2
L_cleanup_loop:
	mov.b	@r5+,r1
	cmp/eq	r6,r5
	mov.b	r1,@(r0,r5)	! r5 was already incremented by the load above
	bf	L_cleanup_loop
L_ready:
	rts
	mov	r3,r0	! (delay slot) return the original destination pointer
L_small:
	! Fewer than 11 bytes: copy everything in the byte loop.
	bra L_cleanup2
	add #-1,r4	! (delay slot) bias r4 as L_cleanup2 expects
#else
	! Big endian version copies with decreasing addresses.
	!
	! Register roles in this variant, as used by the code below:
	!   r0  destination end pointer, moved down by the pre-decrement stores;
	!       it has come down to the original destination at L_ready and so
	!       doubles as the return value
	!   r4  destination start address
	!   r5  offset such that r0 + r5 is the source address to load from
	!   r3  address of the last source byte, shifted right step by step so
	!       that shlr moves its low bits into T for the alignment tests
	!   r7  loop bound derived from the destination start address
	!
	! NOTE(review): the SL macro is defined in asm.h, which is not shown
	! here.  From its use it appears to emit the conditional branch named by
	! its first two arguments and to execute the third argument on both the
	! taken and the fall-through path -- confirm against asm.h.
	mov r4,r0
	add r6,r0	! r0 := destination end address
	sub r4,r5	! r5 := source minus destination
	mov #11,r1
	cmp/hs r1,r6	! T := byte count >= 11 (unsigned)
	! r5 := source minus destination minus 1; small copies go straight to
	! the byte loop.
	SL(bf, L_small,
	add #-1,r5)
	mov r5,r3
	add r0,r3	! r3 := address of the last source byte
	shlr r3		! T := bit 0 of that address; r3 is shifted right by 1
	! r7 := destination start.  T set means the source end address is even.
	SL(bt, L_even,
	mov r4,r7)
	! Source end is odd: copy the last byte so that it becomes even.
	mov.b @(r0,r5),r2
	add #-1,r3	! keep r3 in step with the shortened source
	mov.b r2,@-r0
L_even:
	tst #1,r0	! check if destination end is even
	add #-1,r5	! r5 := source minus destination minus 2
	! r7 := destination start plus 8 (loop bound); an odd destination end
	! is handled at L_odddst.
	SL(bf, L_odddst,
	add #8,r7)
	tst #2,r0	! check if destination end is 4-byte aligned
	bt L_al4dst
	! Not 4-byte aligned: copy one 16-bit word.
	add #-1,r3	! r3 follows the source end moving down by 2
	mov.w @(r0,r5),r1
	mov.w r1,@-r0
L_al4dst:
	shlr r3		! T set: source end is 4-byte aligned as well
	bt L_al4both
	! Source end is 2 bytes off 4-byte alignment: preload one 16-bit word
	! into the upper half of r1, then let xtrct splice it with each
	! longword read.
	mov.w @(r0,r5),r1
	swap.w r1,r1
	add #4,r7	! r7 := destination start plus 12
	add #-4,r5	! r5 := source minus destination minus 6
	.align 2
L_2l_loop:
	! Read & write two longwords per iteration.
	mov.l @(r0,r5),r2
	xtrct r2,r1	! r1 := low half of r2 : high half of r1
	mov.l r1,@-r0
	cmp/hs r7,r0	! T := r0 >= r7 (unsigned); tested by the bt below
	mov.l @(r0,r5),r1
	xtrct r1,r2	! r2 := low half of r1 : high half of r2
	mov.l r2,@-r0
	bt L_2l_loop
	bra L_cleanup
	add #5,r5	! (delay slot) r5 := source minus destination minus 1

	nop ! avoid nop in executed code.
L_al4both:
	add #-2,r5	! r5 := source minus destination minus 4
	.align 2
L_al4both_loop:
	! Read longword, write longword per iteration.  The store is the third
	! SL argument; loop while r0, sampled before the store, is >= r7.
	mov.l @(r0,r5),r1
	cmp/hs r7,r0
	SL(bt, L_al4both_loop,
	mov.l r1,@-r0)
	bra L_cleanup
	add #3,r5	! (delay slot) r5 := source minus destination minus 1

	nop ! avoid nop in executed code.
L_odddst:
	! Destination end is odd; the source end is even here.
	shlr r3		! T set: source end is 4-byte aligned
	bt L_al4src
	! Copy one 16-bit word as two bytes, highest address first.
	mov.w @(r0,r5),r1
	mov.b r1,@-r0
	shlr8 r1
	mov.b r1,@-r0
L_al4src:
	add #-2,r5	! r5 := source minus destination minus 4
	.align 2
L_odd_loop:
	! Read longword, write byte, word, byte per iteration.
	mov.l @(r0,r5),r2
	cmp/hs r7,r0
	mov.b r2,@-r0
	shlr8 r2
	mov.w r2,@-r0
	shlr16 r2
	mov.b r2,@-r0
	bt L_odd_loop

	add #3,r5	! r5 := source minus destination minus 1
L_cleanup:
L_small:
	! Copy the remaining bytes one at a time until r0 has come down to the
	! destination start in r4.
	cmp/eq r4,r0
	bt L_ready
	add #1,r4	! the loop test below runs before r0 is decremented
	.align 2
L_cleanup_loop:
	mov.b @(r0,r5),r2
	cmp/eq r4,r0
	mov.b r2,@-r0
	bf L_cleanup_loop
L_ready:
	! r0 equals the original destination pointer here.
	rts
	nop
#endif