Common Source Code Project for Qt (a.k.a for FM-7).
Révision | bd69099c7c3486b13e11aecc425870b4b1987dc3 (tree) |
---|---|
l'heure | 2017-05-27 20:10:56 |
Auteur | K.Ohta <whatisthis.sowhat@gmai...> |
Committer | K.Ohta |
[COMMON] my_memcpy: On some host CPUs (e.g. x86), GCC uses unaligned SIMD instructions to read/write memory.
@@ -195,8 +195,10 @@ int DLL_PREFIX my_vstprintf_s(_TCHAR *buffer, size_t numberOfElements, const _TC | ||
195 | 195 | |
196 | 196 | void DLL_PREFIX *my_memcpy(void *dst, void *src, size_t len) |
197 | 197 | { |
198 | - register size_t len1; | |
198 | + size_t len1; | |
199 | 199 | register size_t len2; |
200 | + register uint32_t s_align = (uint32_t)(((size_t)src) & 0x1f); | |
201 | + register uint32_t d_align = (uint32_t)(((size_t)dst) & 0x1f); | |
200 | 202 | int i; |
201 | 203 | |
202 | 204 | if(len == 0) return dst; |
@@ -204,9 +206,9 @@ void DLL_PREFIX *my_memcpy(void *dst, void *src, size_t len) | ||
204 | 206 | return memcpy(dst, src, len); |
205 | 207 | } |
206 | 208 | len1 = len; |
207 | - size_t s_align = ((size_t)src) & 0x1f; | |
208 | - size_t d_align = ((size_t)dst) & 0x1f; | |
209 | -#if 1 | |
209 | + | |
210 | +#if defined(WITHOUT_UNALIGNED_SIMD) | |
211 | +// Using SIMD without un-aligned instructions. | |
210 | 212 | switch(s_align) { |
211 | 213 | case 0: // Align 256 |
212 | 214 | { |
@@ -526,59 +528,56 @@ void DLL_PREFIX *my_memcpy(void *dst, void *src, size_t len) | ||
526 | 528 | break; |
527 | 529 | } |
528 | 530 | |
529 | -#else | |
530 | - // Check align(preamble) | |
531 | - if(((size_t)s & 0x0f) != 0) { // Src not align 16 | |
532 | - if(((size_t)s & 0x07) != 0) { // Src not Align 8 | |
533 | - return memcpy(d, s, len1); | |
534 | - } else { // Align 8 (at least src) | |
535 | - if(((size_t)d & 0x07) != 0) { // Dst not align 8 | |
536 | - return memcpy(d, s, len1); | |
537 | - } | |
538 | -__src_dst_align_8: | |
539 | - uint32_t b64[2]; | |
540 | - register uint32_t *s64 = (uint32_t *)s; | |
541 | - register uint32_t *d64 = (uint32_t *)d; | |
542 | - | |
543 | - // Src and Dst align 8 (at least) | |
544 | - len2 = len1 >> 3; | |
545 | - i = 0; | |
546 | - while(len2 > 0) { | |
547 | - for(i = 0; i < 2; i++) b64[i] = s64[i]; | |
548 | - for(i = 0; i < 2; i++) d64[i] = b64[i]; | |
549 | - s64 += 2; | |
550 | - d64 += 2; | |
551 | - --len2; | |
552 | - } | |
553 | - len1 = len1 & 7; | |
554 | - if(len1 != 0) return memcpy((uint8_t *)d64, (uint8_t *)s64, len1); | |
555 | - return dst; | |
556 | - } | |
557 | - } else { // Src align 16 | |
558 | - if(((size_t)d & 0x0f) != 0) { // Dst not align 16 | |
559 | - if(((size_t)d & 0x07) != 0) { // Dst not align 8 | |
560 | - return memcpy(d, s, len1); | |
561 | - } | |
562 | - // Dst align 8 | |
563 | - goto __src_dst_align_8; | |
564 | - } else { // Src and Dst align 16 | |
565 | -__src_dst_align_16: | |
566 | - len2 = len1 >> 4; | |
567 | - uint32_t b128[4]; | |
568 | - register uint32_t *s128 = (uint32_t *)s; | |
569 | - register uint32_t *d128 = (uint32_t *)d; | |
570 | - while(len2 > 0) { | |
571 | - for(i = 0; i < 4; i++) b128[i] = s128[i]; | |
572 | - for(i = 0; i < 4; i++) d128[i] = b128[i]; | |
573 | - s128 += 4; | |
574 | - d128 += 4; | |
575 | - --len2; | |
576 | - } | |
577 | - len1 = len1 & 0x0f; | |
578 | - if(len1 != 0) return memcpy((uint8_t *)d128, (uint8_t *)s128, len1); | |
579 | - return dst; | |
531 | +#else | |
532 | +// Using SIMD *with* un-aligned instructions. | |
533 | + register uint32_t *s32 = (uint32_t *)src; | |
534 | + register uint32_t *d32 = (uint32_t *)dst; | |
535 | + if(((s_align & 0x07) != 0x0) && ((d_align & 0x07) != 0x0)) { // None align. | |
536 | + return memcpy(dst, src, len); | |
537 | + } | |
538 | + if((s_align == 0x0) || (d_align == 0x0)) { // Align to 256bit | |
539 | + uint32_t b256[8]; | |
540 | + len2 = len1 >> 5; | |
541 | + while(len2 > 0) { | |
542 | + for(i = 0; i < 8; i++) b256[i] = s32[i]; | |
543 | + for(i = 0; i < 8; i++) d32[i] = b256[i]; | |
544 | + s32 += 8; | |
545 | + d32 += 8; | |
546 | + --len2; | |
580 | 547 | } |
548 | + len1 = len1 & 0x1f; | |
549 | + if(len1 != 0) return memcpy(d32, s32, len1); | |
550 | + return dst; | |
581 | 551 | } |
552 | + if(((s_align & 0x0f) == 0x0) || ((d_align & 0x0f) == 0x0)) { // Align to 128bit | |
553 | + uint32_t b128[4]; | |
554 | + len2 = len1 >> 4; | |
555 | + while(len2 > 0) { | |
556 | + for(i = 0; i < 4; i++) b128[i] = s32[i]; | |
557 | + for(i = 0; i < 4; i++) d32[i] = b128[i]; | |
558 | + s32 += 4; | |
559 | + d32 += 4; | |
560 | + --len2; | |
561 | + } | |
562 | + len1 = len1 & 0x0f; | |
563 | + if(len1 != 0) return memcpy(d32, s32, len1); | |
564 | + return dst; | |
565 | + } | |
566 | + if(((s_align & 0x07) == 0x0) || ((d_align & 0x07) == 0x0)) { // Align to 64bit | |
567 | + uint32_t b64[2]; | |
568 | + len2 = len1 >> 3; | |
569 | + while(len2 > 0) { | |
570 | + for(i = 0; i < 2; i++) b64[i] = s32[i]; | |
571 | + for(i = 0; i < 2; i++) d32[i] = b64[i]; | |
572 | + s32 += 2; | |
573 | + d32 += 2; | |
574 | + --len2; | |
575 | + } | |
576 | + len1 = len1 & 0x07; | |
577 | + if(len1 != 0) return memcpy(d32, s32, len1); | |
578 | + return dst; | |
579 | + } | |
580 | + //if(len1 != 0) return memcpy(dst, src, len1); | |
582 | 581 | #endif |
583 | 582 | // Trap |
584 | 583 | return dst; |