/* Sample code: use the NEON coprocessor instead of the CPU to perform memcpy -- offloading large block copies helps reduce CPU load. */
#include <malloc.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <unistd.h>
/*
 * Allocate `size` bytes at an address that is a multiple of `align`.
 *
 * `align` must be a power of two (0 also passes the check); any value below
 * 2 * sizeof(void *) is raised to that minimum so there is always room for
 * one bookkeeping slot.  The pointer obtained from malloc() is stored in the
 * void* slot immediately below the returned address, which is where
 * _aligned_free() reads it back.
 *
 * Returns NULL when `align` is not a power of two, when `size` is 0, when
 * size + align would overflow size_t, or when malloc() fails.
 * The result must be released with _aligned_free(), never with free().
 */
static __inline__ void* _aligned_malloc(size_t size, size_t align)
{
    void *malloc_ptr;
    void *aligned_ptr;

    /* A power of two has a single bit set, so x & (x - 1) is zero. */
    if (align & (align - 1))
        return NULL;
    if (size == 0)
        return NULL;

    /* Assumes malloc() results are aligned to at least sizeof(void *), so
     * rounding up by at least 2 * sizeof(void *) always leaves one pointer
     * slot free below the aligned address. */
    if (align < 2 * sizeof(void *))
        align = 2 * sizeof(void *);

    /* Without this check size + align could wrap around and a tiny block
     * would be handed out for a huge request. */
    if (size > SIZE_MAX - align)
        return NULL;

    malloc_ptr = malloc(size + align);
    if (!malloc_ptr)
        return NULL;

    /* Step past the start by `align`, then round down to the boundary: the
     * result lies in (malloc_ptr, malloc_ptr + align]. */
    aligned_ptr = (void *)(((uintptr_t)malloc_ptr + align)
                           & ~((uintptr_t)align - 1));

    /* Remember the original pointer just below the aligned block. */
    ((void **)aligned_ptr)[-1] = malloc_ptr;
    return aligned_ptr;
}
/*
 * Release a block whose original malloc() pointer is stored in the void*
 * slot directly below `aligned_ptr`.  A NULL argument is ignored.
 */
static __inline__ void _aligned_free(void * aligned_ptr)
{
    void **slot;

    if (!aligned_ptr)
        return;

    /* The pointer to hand back to free() lives one slot below the block. */
    slot = (void **)aligned_ptr - 1;
    free(*slot);
}
/*
 * Copy `n` bytes from `src` to `dest`.
 *
 * On 32-bit ARM targets the largest multiple of 64 bytes is moved by a loop
 * that loads d0-d7 (8 x 8 = 64 bytes) from the source and stores them to the
 * destination, prefetching 0xC0 bytes ahead of the source pointer; any
 * remaining 1..63 bytes are copied with memcpy().  On other targets the whole
 * copy is done with memcpy().
 *
 * The registers are passed as asm operands and every clobbered resource is
 * declared, so the code no longer relies on the arguments still sitting in
 * r0/r1/r2 when the asm statement starts.
 *
 * NOTE(review): vldm/vstm are expected to need at least word-aligned
 * addresses; the callers in this file pass 64-byte aligned buffers --
 * confirm before using with arbitrary pointers.
 */
void __attribute__ ((noinline)) memcpy_neon_pld(void *dest, const void *src, size_t n)
{
    unsigned char *d = (unsigned char *)dest;
    const unsigned char *s = (const unsigned char *)src;
    size_t tail = n;

#if defined(__arm__)
    size_t bulk = n & ~(size_t)0x3F;   /* bytes handled 64 at a time */

    tail = n - bulk;
    if (bulk != 0)
    {
        __asm__ volatile(
            "1:\n"
            "    pld   [%[s], #0xC0]\n"      /* prefetch upcoming source data */
            "    vldm  %[s]!, {d0-d7}\n"     /* load 64 bytes from src, advance src */
            "    vstm  %[d]!, {d0-d7}\n"     /* store 64 bytes to dest, advance dest */
            "    subs  %[n], %[n], #0x40\n"  /* 64 fewer bytes remain */
            "    bne   1b\n"                 /* bulk is a non-zero multiple of 64 */
            : [d] "+r" (d), [s] "+r" (s), [n] "+r" (bulk)
            :
            : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory"
        );
    }
#endif

    /* d and s now point just past the bulk part (or at the start when the
     * asm path is not compiled in); finish with a plain byte copy. */
    if (tail != 0)
        memcpy(d, s, tail);
}
/*
 * Benchmark driver.
 *
 * For each buffer size in the table, allocates two 64-byte aligned buffers,
 * copies src to dest 32 times, verifies the result byte by byte and prints
 * the average copy time in microseconds for that size.
 *
 * argv[1] (optional) selects the copy routine: 0 or absent uses
 * memcpy_neon_pld(), any other value uses the C library memcpy().
 *
 * Returns 0 on success and EXIT_FAILURE if a buffer cannot be allocated.
 */
int main(int argc, char *argv[])
{
    struct timeval begin, end;
    unsigned char *dest, *src;
    unsigned i = 0, index = 0, total, j, s;
    int size[9] = {1024, 2*1024, 4*1024, 8*1024, 16*1024, 128*1024, 512*1024 ,1024*1024, 8*1024*1024};

    if (argc > 1)
        index = atoi(argv[1]);

    /* Strictly less than the element count: size[9] does not exist. */
    for (s = 0; s < sizeof(size)/sizeof(size[0]); s++)
    {
        /* Start each size from zero so the average is per size. */
        total = 0;

        printf("size: %dK\n", size[s]/1024);
        dest = (unsigned char*)_aligned_malloc(size[s], 0x40);
        if (NULL == dest)
            return EXIT_FAILURE;
        src = (unsigned char*)_aligned_malloc(size[s], 0x40);
        if (NULL == src)
        {
            /* Aligned blocks must go back through _aligned_free(). */
            _aligned_free(dest);
            return EXIT_FAILURE;
        }

        /* Different fill patterns so a copy that did nothing is detected. */
        memset(src, 0x44, size[s]);
        memset(dest, 0x33, size[s]);

        for (j = 0; j < 32; j++)
        {
            gettimeofday(&begin, NULL);
            if (0 == index)
                memcpy_neon_pld(dest, src, size[s]);
            else
                memcpy(dest, src, size[s]);
            gettimeofday(&end, NULL);

            for (i = 0; i < (unsigned)size[s]; i++)
            {
                if (src[i] != dest[i])
                {
                    printf("error %u src %d dest %d!\n", i, src[i], dest[i]);
                }
            }

            /* i equals size[s] whenever the verification loop ran to the
             * end, which is always the case while it has no early exit. */
            if (i == (unsigned)size[s])
            {
                int tv = 1000000 * (end.tv_sec - begin.tv_sec) + (end.tv_usec - begin.tv_usec);
                total += tv;
            }
        }
        printf("average : %u us\n", total / 32);

        /* Release this size's buffers before moving to the next size. */
        _aligned_free(src);
        _aligned_free(dest);
    }
    return 0;
}