bdnz 8b
blr
-_GLOBAL(bcopy)
- mr r6,r3
- mr r3,r4
- mr r4,r6
- b .memcpy
-
-/*
- * This version uses dcbz on the complete cache lines in the
- * destination area to reduce memory traffic. This requires that
- * the destination area is cacheable.
- * We only use this version if the source and dest don't overlap.
- * -- paulus.
- */
-_GLOBAL(cacheable_memcpy)
- add r7,r3,r5 /* test if the src & dst overlap */
- add r8,r4,r5
- cmplw 0,r4,r7
- cmplw 1,r3,r8
- crand 0,0,4 /* cr0.lt &= cr1.lt */
- blt .memcpy /* if regions overlap */
-
- addi r4,r4,-4
- addi r6,r3,-4
- neg r0,r3
- andi. r0,r0,CACHELINE_MASK /* # bytes to start of cache line */
- beq 58f
-
- cmplw 0,r5,r0 /* is this more than total to do? */
- blt 63f /* if not much to do */
- andi. r8,r0,3 /* get it word-aligned first */
- subf r5,r0,r5
- mtctr r8
- beq+ 61f
-70: lbz r9,4(r4) /* do some bytes */
- stb r9,4(r6)
- addi r4,r4,1
- addi r6,r6,1
- bdnz 70b
-61: srwi. r0,r0,2
- mtctr r0
- beq 58f
-72: lwzu r9,4(r4) /* do some words */
- stwu r9,4(r6)
- bdnz 72b
-
-58: srwi. r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
- clrlwi r5,r5,32-LG_CACHELINE_BYTES
- li r11,4
- mtctr r0
- beq 63f
-53:
- dcbz r11,r6
- COPY_16_BYTES
-#if CACHE_LINE_SIZE >= 32
- COPY_16_BYTES
-#if CACHE_LINE_SIZE >= 64
- COPY_16_BYTES
- COPY_16_BYTES
-#if CACHE_LINE_SIZE >= 128
- COPY_16_BYTES
- COPY_16_BYTES
- COPY_16_BYTES
- COPY_16_BYTES
-#endif
-#endif
-#endif
- bdnz 53b
-
-63: srwi. r0,r5,2
- mtctr r0
- beq 64f
-30: lwzu r0,4(r4)
- stwu r0,4(r6)
- bdnz 30b
-
-64: andi. r0,r5,3
- mtctr r0
- beq+ 65f
-40: lbz r0,4(r4)
- stb r0,4(r6)
- addi r4,r4,1
- addi r6,r6,1
- bdnz 40b
-65: blr
-
_GLOBAL(memmove)
cmplw 0,r3,r4
bgt .backwards_memcpy