使用 NEON 协处理器代替 CPU 执行 memcpy 的示例代码——拷贝大块数据时有助于降低 CPU 占用
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <string.h>
#include <sys/time.h>
#include <malloc.h>

/* Number of bytes moved by one iteration of the NEON loop (d0-d7 = 8 * 8). */
enum { NEON_BLOCK_BYTES = 64 };

/*
 * Allocate `size` bytes whose address is a multiple of `align`.
 *
 * align: must be zero or a power of two; values smaller than
 *        2 * sizeof(void *) are raised to that minimum.
 * Returns NULL when align is not a power of two, when size is 0, when
 * size + align would overflow, or when malloc fails.
 *
 * The pointer returned by malloc is stored in the slot immediately below the
 * aligned address, so the block must be released with _aligned_free(), never
 * with free().
 */
static __inline__ void *_aligned_malloc(size_t size, size_t align)
{
    void *raw;
    uintptr_t aligned;

    if (align & (align - 1))
        return NULL;
    if (size == 0)
        return NULL;

    /* Guarantees at least sizeof(void *) of slack below the aligned address
     * for the back-pointer (malloc is assumed to align to sizeof(void *)). */
    if (align < 2 * sizeof(void *))
        align = 2 * sizeof(void *);

    /* Without this check size + align can wrap and yield a tiny buffer. */
    if (size > SIZE_MAX - align)
        return NULL;

    raw = malloc(size + align);
    if (raw == NULL)
        return NULL;

    /* Round (raw + align) down to the alignment boundary. */
    aligned = ((uintptr_t)raw + align) & ~((uintptr_t)align - 1);
    ((void **)aligned)[-1] = raw;
    return (void *)aligned;
}

/*
 * Release a block obtained from _aligned_malloc(). NULL is accepted and
 * ignored; the check is required because the back-pointer is read from
 * aligned_ptr[-1] before free() is reached.
 */
static __inline__ void _aligned_free(void *aligned_ptr)
{
    if (aligned_ptr)
        free(((void **)aligned_ptr)[-1]);
}

/*
 * Copy n bytes from src to dest; the regions must not overlap.
 *
 * On 32-bit ARM builds with NEON enabled, whole 64-byte blocks are moved
 * through d0-d7 with a prefetch hint 0xC0 bytes ahead of the read pointer;
 * any remaining tail (n % 64) is copied with memcpy so no byte outside
 * [dest, dest + n) is written. On every other target the function simply
 * calls memcpy.
 */
void __attribute__((noinline)) memcpy_neon_pld(void *dest, const void *src, size_t n)
{
    if (n == 0)
        return;

#if defined(__arm__) && (defined(__ARM_NEON) || defined(__ARM_NEON__))
    {
        const size_t bulk = n & ~(size_t)(NEON_BLOCK_BYTES - 1);

        if (bulk != 0) {
            unsigned char *d = dest;
            const unsigned char *s = src;
            size_t left = bulk;

            /*
             * Operands and clobbers are declared so the compiler knows which
             * registers and memory are touched. A numeric local label is used
             * so the code can be emitted more than once without a duplicate
             * symbol. `left` is an exact multiple of 64 (the literal #64
             * below must match NEON_BLOCK_BYTES), so it reaches zero exactly
             * and `bne` is safe even for counts with the top bit set.
             */
            __asm__ __volatile__(
                "1:\n"
                "    pld   [%[s], #0xC0]\n"      /* prefetch hint ahead of src */
                "    vldm  %[s]!, {d0-d7}\n"     /* load 64 bytes from src */
                "    vstm  %[d]!, {d0-d7}\n"     /* store 64 bytes to dest */
                "    subs  %[left], %[left], #64\n"
                "    bne   1b\n"
                : [d] "+r"(d), [s] "+r"(s), [left] "+r"(left)
                :
                : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory");
        }

        if (bulk != n)
            memcpy((unsigned char *)dest + bulk,
                   (const unsigned char *)src + bulk, n - bulk);
    }
#else
    memcpy(dest, src, n);
#endif
}

/*
 * Define MEMCPY_NEON_NO_MAIN to build only the helpers above (for example to
 * link them into a test harness). By default the benchmark driver is built.
 */
#ifndef MEMCPY_NEON_NO_MAIN

/* Number of timed copies per buffer size. */
enum { BENCH_RUNS = 32 };

/* Microseconds elapsed between two gettimeofday() samples. */
static long elapsed_us(const struct timeval *begin, const struct timeval *end)
{
    return 1000000L * (long)(end->tv_sec - begin->tv_sec)
         + (long)(end->tv_usec - begin->tv_usec);
}

/*
 * Benchmark driver.
 *
 * argv[1] (optional): 0 or absent selects memcpy_neon_pld, any other number
 * selects the C library memcpy. For each buffer size the copy is timed
 * BENCH_RUNS times, the destination is compared with the source, and the
 * average time in microseconds is printed.
 *
 * Returns 0 on completion, EXIT_FAILURE if a buffer cannot be allocated.
 */
int main(int argc, char *argv[])
{
    static const size_t sizes[] = {
        1024, 2 * 1024, 4 * 1024, 8 * 1024, 16 * 1024,
        128 * 1024, 512 * 1024, 1024 * 1024, 8 * 1024 * 1024
    };
    const size_t size_count = sizeof sizes / sizeof sizes[0];
    int use_libc = 0;

    if (argc > 1)
        use_libc = strtol(argv[1], NULL, 10) != 0;

    /* Strictly less than: index size_count is one past the end of sizes[]. */
    for (size_t s = 0; s < size_count; s++) {
        const size_t len = sizes[s];
        unsigned total = 0; /* reset per size so averages are independent */
        unsigned char *dest;
        unsigned char *src;

        printf("size: %zuK\n", len / 1024);

        dest = _aligned_malloc(len, 0x40);
        if (dest == NULL) {
            fprintf(stderr, "allocation of %zu bytes failed\n", len);
            return EXIT_FAILURE;
        }
        src = _aligned_malloc(len, 0x40);
        if (src == NULL) {
            fprintf(stderr, "allocation of %zu bytes failed\n", len);
            /* Must go through _aligned_free: dest is not the malloc pointer. */
            _aligned_free(dest);
            return EXIT_FAILURE;
        }

        memset(src, 0x44, len);
        memset(dest, 0x33, len);

        for (int run = 0; run < BENCH_RUNS; run++) {
            struct timeval begin, end;

            gettimeofday(&begin, NULL);
            if (use_libc)
                memcpy(dest, src, len);
            else
                memcpy_neon_pld(dest, src, len);
            gettimeofday(&end, NULL);

            /* Every mismatching byte is reported; the run is still timed. */
            for (size_t i = 0; i < len; i++) {
                if (src[i] != dest[i])
                    printf("error %zu src %d dest %d!\n", i, src[i], dest[i]);
            }

            total += (unsigned)elapsed_us(&begin, &end);
        }

        printf("average : %u us\n", total / BENCH_RUNS);

        /* Release this size's buffers before allocating the next pair. */
        _aligned_free(src);
        _aligned_free(dest);
    }

    return 0;
}

#endif /* MEMCPY_NEON_NO_MAIN */