1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
|
!
! Fast SH memcpy
!
! by Toshiyasu Morita (tm@netcom.com)
! hacked by J"orn Rernnecke (amylaar@cygnus.co.uk) ("o for o-umlaut)
!
! Entry: r4: destination pointer
! r5: source pointer
! r6: byte count
!
! Exit: r0: destination pointer
! r1-r7: trashed
!
! Notes: Usually one wants to do small reads and write a longword, but
! unfortunately it is difficult in some cases to concatenate bytes
! into a longword on the SH, so this does a longword read and small
! writes.
!
! This implementation makes two assumptions about how it is called:
!
! 1.: If the byte count is nonzero, the address of the last byte to be
! copied is unsigned greater than the address of the first byte to
! be copied. This could be easily swapped for a signed comparison,
! but the algorithm used needs some comparison.
!
! 2.: When there are two or three bytes in the last word of an 11-or-more
! bytes memory chunk to be copied, the rest of the word can be read
! without side effects.
! This could be easily changed by increasing the minimum size of
! a fast memcpy and the amount subtracted from r7 before L_2l_loop by 2,
! however, this would cost a few extra cycles on average.
!
#include "asm.h"
! memcpy: copy r6 bytes from the source at r5 to the destination at r4
! and return the original destination pointer in r0.
!
! Two independent implementations follow, one per target endianness,
! selected at preprocessing time.  Both treat byte counts below 11 as
! small and copy them with the byte-wise cleanup loop only.  Larger
! copies first align the source to an even address and then pick one of
! three longword-read loops depending on the remaining alignment of the
! source and the destination.
!
! NOTE(review): the SL and ENTRY macros are not defined in this file
! (presumably they come from asm.h).  From the way SL is used here, its
! trailing instruction is relied upon on both the taken and the
! fall-through path of the branch (delay slot or equivalent) - confirm
! against asm.h.
ENTRY(memcpy)
#ifdef __LITTLE_ENDIAN__
! Little endian version copies with increasing addresses.
!
! Register roles in this version:
!   r3 = saved destination pointer, moved to r0 on return
!   r5 = running source pointer (post-incremented by the loads)
!   r6 = source end address
!   r7 = loop limit derived from the source end address
!   r0 = destination minus source, plus a per-path bias, so that stores
!        can address the destination as @(r0,r5)
!   r4 = destination pointer.  In the odd-destination path it is biased
!        by -1 and advanced explicitly instead of using r0.
mov r4,r3 ! Save return value
mov #11,r0 ! Check if small number of bytes
cmp/hs r0,r6
! r6 becomes src end address
SL(bf, L_small, add r5,r6)
mov #1,r1
tst r1,r5 ! check if source even
! r7 := src end address, adjusted below into the loop limit
SL(bt, L_even, mov r6,r7)
mov.b @r5+,r0 ! no, make it even.
mov.b r0,@r4
add #1,r4
L_even: tst r1,r4 ! check if destination is even
add #-3,r7 ! r7 := src end address minus 3
! r1 := 2, the mask used by the 4-byte alignment tests below
SL(bf, L_odddst, mov #2,r1)
tst r1,r4 ! check if destination is 4-byte aligned
mov r4,r0
! r0 := destination minus source
SL(bt, L_al4dst, sub r5,r0)
! Destination is only 2-byte aligned: copy one 16-bit word to align it.
mov.w @r5+,r2
mov.w r2,@r4
! add #2,r4 r4 is dead here.
L_al4dst:
tst r1,r5 ! check if source is 4-byte aligned as well
bt L_al4both
! Source is only 2-byte aligned.  Preload the next halfword into the
! upper half of r1 so that the loop can merge it with the low half of
! the following longword using xtrct.
mov.w @r5+,r1
swap.w r1,r1
add #-6,r0 ! at each store in the loop r5 is 6 bytes past the data stored
add #-6,r7 ! r7 := src end address minus 9.
.align 2
L_2l_loop:
mov.l @r5+,r2 ! Read & write two longwords per iteration
xtrct r2,r1 ! r1 := pending halfword combined with low half of r2
mov.l r1,@(r0,r5)
cmp/hs r7,r5 ! T := r5 has reached the limit, tested by the bf below
mov.l @r5+,r1
xtrct r1,r2 ! r2 := high half of r2 combined with low half of r1
mov.l r2,@(r0,r5)
bf L_2l_loop
! The upper halfword of r1 has been read but not stored.  Step the
! source pointer back by 2 so that the cleanup loop copies it.
add #-2,r5
bra L_cleanup
add #5,r0 ! r0 := destination minus source minus 1, as L_cleanup expects
L_al4both:
! Source and destination are both 4-byte aligned.
add #-4,r0 ! the store happens after r5 was post-incremented by 4
.align 2
L_al4both_loop:
mov.l @r5+,r4 ! Read longword, write longword per iteration
cmp/hs r7,r5
SL(bf, L_al4both_loop, mov.l r4,@(r0,r5))
bra L_cleanup
add #3,r0 ! r0 := destination minus source minus 1, as L_cleanup expects
L_odddst:
! Destination is odd.  r1 is 2 here, so this tests source bit 1.
tst r1,r5
! r4 := destination minus 1, so the stores below use offsets 1, 2 and 4
SL(bt, L_al4src, add #-1,r4)
! Source is only 2-byte aligned: copy one 16-bit word as two bytes.
mov.w @r5+,r0
mov.b r0,@(1,r4)
shlr8 r0
mov.b r0,@(2,r4)
add #2,r4
L_al4src:
.align 2
L_odd_loop:
mov.l @r5+,r0 ! Read longword, write byte, word, byte per iteration
cmp/hs r7,r5 ! T := r5 has reached the limit, tested by the branch below
mov.b r0,@(1,r4)
shlr8 r0
mov.w r0,@(2,r4)
shlr16 r0
mov.b r0,@(4,r4)
SL(bf, L_odd_loop, add #4,r4)
.align 2 ! avoid nop in more frequently executed code.
L_cleanup2:
! Entered with r4 = next destination address minus 1.
! r0 := r4 minus r5, the offset form that L_cleanup expects.
mov r4,r0
sub r5,r0
L_cleanup:
! Copy the remaining bytes one at a time until r5 reaches r6.
cmp/eq r6,r5
bt L_ready
.align 2
L_cleanup_loop:
mov.b @r5+,r1
cmp/eq r6,r5 ! T := this was the last byte, tested by the bf below
mov.b r1,@(r0,r5)
bf L_cleanup_loop
L_ready:
rts
mov r3,r0 ! return the saved destination pointer
L_small:
! Fewer than 11 bytes: copy everything with the byte loop.
bra L_cleanup2
add #-1,r4 ! bias r4 by -1 as L_cleanup2 expects
#else
! Big endian version copies with decreasing addresses.
!
! Register roles in this version:
!   r0 = running destination pointer.  It starts at the destination end
!        and is pre-decremented by the stores.  It equals r4, the
!        destination start, when L_ready is reached.
!   r4 = destination start
!   r5 = source minus destination, plus a per-path bias, so that loads
!        can address the source as @(r0,r5)
!   r7 = loop limit derived from the destination start
!   r3 = address of the last source byte, shifted right one bit at a
!        time to test the alignment of the source end
mov r4,r0
add r6,r0 ! r0 := destination end address
sub r4,r5 ! r5 := source minus destination
mov #11,r1
cmp/hs r1,r6 ! check if small number of bytes
! r5 := source minus destination minus 1
SL(bf, L_small,
add #-1,r5)
mov r5,r3
add r0,r3 ! r3 := address of the last source byte
shlr r3 ! T := bit 0 of that address, set when the source end is even
! r7 := destination start, adjusted below into the loop limit
SL(bt, L_even,
mov r4,r7)
! Source end is odd: copy one byte to make it even.
mov.b @(r0,r5),r2
add #-1,r3
mov.b r2,@-r0
L_even:
tst #1,r0 ! check if destination end is even
add #-1,r5
! r7 := destination start plus 8
SL(bf, L_odddst,
add #8,r7)
tst #2,r0 ! check if destination end is 4-byte aligned
bt L_al4dst
! Destination end is only 2-byte aligned: copy one 16-bit word.
add #-1,r3
mov.w @(r0,r5),r1
mov.w r1,@-r0
L_al4dst:
shlr r3 ! T := next alignment bit of the source end, tested below
bt L_al4both
! Source end is only 2-byte aligned.  Preload a halfword into the upper
! half of r1 so that the loop can merge it with the following longword
! using xtrct.
mov.w @(r0,r5),r1
swap.w r1,r1
add #4,r7 ! r7 := destination start plus 12
add #-4,r5
.align 2
L_2l_loop:
! Read and write two longwords per iteration.
mov.l @(r0,r5),r2
xtrct r2,r1
mov.l r1,@-r0
cmp/hs r7,r0 ! T := r0 is still at or above the limit, tested by the bt below
mov.l @(r0,r5),r1
xtrct r1,r2
mov.l r2,@-r0
bt L_2l_loop
bra L_cleanup
add #5,r5 ! r5 := source minus destination minus 1, as L_cleanup expects
nop ! avoid nop in executed code.
L_al4both:
! Source end and destination end are both 4-byte aligned.
add #-2,r5
.align 2
L_al4both_loop:
! Read longword, write longword per iteration.
mov.l @(r0,r5),r1
cmp/hs r7,r0
SL(bt, L_al4both_loop,
mov.l r1,@-r0)
bra L_cleanup
add #3,r5 ! r5 := source minus destination minus 1, as L_cleanup expects
nop ! avoid nop in executed code.
L_odddst:
! Destination end is odd.
shlr r3 ! T := next alignment bit of the source end, tested below
bt L_al4src
! Source end is only 2-byte aligned: copy one 16-bit word as two bytes.
mov.w @(r0,r5),r1
mov.b r1,@-r0
shlr8 r1
mov.b r1,@-r0
L_al4src:
add #-2,r5
.align 2
L_odd_loop:
! Read longword, write byte, word, byte per iteration.
mov.l @(r0,r5),r2
cmp/hs r7,r0 ! T := r0 is still at or above the limit, tested by the bt below
mov.b r2,@-r0
shlr8 r2
mov.w r2,@-r0
shlr16 r2
mov.b r2,@-r0
bt L_odd_loop
add #3,r5 ! r5 := source minus destination minus 1, as L_cleanup expects
L_cleanup:
L_small:
! Copy the remaining bytes one at a time until r0 reaches the
! destination start.
cmp/eq r4,r0
bt L_ready
! Compare against start plus 1 inside the loop because the store
! pre-decrements r0 after the comparison.
add #1,r4
.align 2
L_cleanup_loop:
mov.b @(r0,r5),r2
cmp/eq r4,r0
mov.b r2,@-r0
bf L_cleanup_loop
L_ready:
! r0 equals the destination start here, which is the return value.
rts
nop
#endif
|