/* Sample code: using the NEON coprocessor instead of the CPU to do memcpy -- copying large blocks of data this way helps reduce CPU load. */

#include <stdio.h>

#include <stdlib.h>

#include <unistd.h>

#include <string.h>

#include <sys/time.h>

#include <malloc.h>

/*
 * Allocate `size` bytes whose start address is a multiple of `align`.
 *
 * align must be zero or a power of two; values below 2 * sizeof(void *)
 *       (including zero) are raised to that minimum.
 *
 * Returns the aligned pointer, or NULL when align is not a power of two,
 * when size is 0, when size + align would overflow size_t, or when malloc
 * fails.
 *
 * The pointer returned by malloc is stored in the word immediately below
 * the aligned address, so the block must be released with _aligned_free()
 * and never with free() directly.
 */
static __inline__ void* _aligned_malloc(size_t size, size_t align)
{
    void * malloc_ptr;
    void * aligned_ptr;

    /* Error if align is not a power of two. */
    if(align & (align - 1))
        return NULL;

    if(size == 0)
        return NULL;

    /* Assume the malloc'd pointer is aligned at least to sizeof (void *).
       Enforcing a minimum alignment of 2 * sizeof (void *) guarantees there
       is room below the aligned address to store the value returned by
       malloc. */
    if(align < 2 * sizeof(void *))
        align = 2 * sizeof(void *);

    /* Refuse requests where size + align would wrap around; otherwise
       malloc would be asked for a tiny block and the caller would receive
       a pointer to far fewer bytes than requested. */
    if(size > (size_t)-1 - align)
        return NULL;

    malloc_ptr = malloc(size + align);

    if(!malloc_ptr)
        return NULL;

    /* Round up to the next multiple of align strictly above malloc_ptr, so
       at least sizeof (void *) bytes remain below the aligned address. */
    aligned_ptr = (void *)(((size_t) malloc_ptr + align)
                       & ~((size_t)(align) - 1));

    /* Store the original pointer just before the aligned block. */
    ((void **) aligned_ptr) [-1] = malloc_ptr;

    return aligned_ptr;
}

/*
 * Release a block that was obtained from _aligned_malloc().
 *
 * The pointer-sized word stored directly below aligned_ptr is read and
 * handed to free(). A NULL argument is ignored.
 */
static __inline__ void _aligned_free(void * aligned_ptr)
{
    void **slot;

    if(aligned_ptr == NULL)
        return;

    /* The original malloc pointer sits one pointer slot before the block. */
    slot = (void **) aligned_ptr;
    free(*(slot - 1));
}
/*
 * Copy n bytes from src to dest; the buffers must not overlap.
 *
 * On 32-bit ARM builds with NEON enabled, the largest multiple of 64 bytes
 * is moved by a loop that prefetches ahead of the source pointer and
 * transfers 64 bytes per iteration through d0-d7. Any remaining tail
 * (n % 64 bytes) is copied with memcpy, so no byte beyond n is touched and
 * n == 0 is a no-op. On every other target the whole copy falls back to
 * memcpy.
 *
 * NOTE(review): the NEON path presumably needs word-aligned dest/src for
 * VLDM/VSTM -- confirm against the ARM reference. The caller in this file
 * passes 64-byte-aligned buffers.
 */
void __attribute__ ((noinline)) memcpy_neon_pld(void *dest, const void *src, size_t n)
{
    unsigned char *d = (unsigned char *)dest;
    const unsigned char *s = (const unsigned char *)src;

#if defined(__arm__) && (defined(__ARM_NEON) || defined(__ARM_NEON__))
    /* Bytes handled by the NEON loop: n rounded down to a multiple of 64. */
    size_t bulk = n & ~(size_t)0x3F;

    n -= bulk;

    if (bulk != 0)
    {
        /* Operands are passed explicitly rather than assuming the arguments
           still live in r0-r2, and the touched registers, flags and memory
           are declared so the compiler does not cache values across the
           copy. A numeric local label avoids duplicate-symbol errors. */
        __asm__ __volatile__(
        "1:\n"
        "   pld [%[s], #0xC0]\n"          /* prefetch source data ahead of the read pointer */
        "   vldm %[s]!, {d0-d7}\n"        /* load 8 * 8 = 64 bytes from src, advance src */
        "   vstm %[d]!, {d0-d7}\n"        /* store the same 64 bytes to dest, advance dest */
        "   subs %[cnt], %[cnt], #0x40\n" /* 64 fewer bytes remaining */
        "   bne 1b\n"                     /* cnt is an exact multiple of 64, so it reaches 0;
                                             bne (unlike signed bgt) also works for counts >= 2^31 */
        : [d] "+r" (d), [s] "+r" (s), [cnt] "+r" (bulk)
        :
        : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "cc", "memory"
        );
    }
#endif

    /* Tail on NEON builds (d and s were advanced by the loop's writeback);
       the whole buffer on other targets. */
    if (n != 0)
        memcpy(d, s, n);
}

/*
 * Benchmark driver.
 *
 * For each buffer size in the table, allocates 64-byte-aligned source and
 * destination buffers, copies source to destination 32 times, verifies the
 * result byte by byte after every copy, and prints the average copy time in
 * microseconds.
 *
 * argv[1] (optional): 0 or absent selects memcpy_neon_pld; any other
 * integer selects the C library memcpy.
 *
 * Returns EXIT_SUCCESS when every copy verified correctly, EXIT_FAILURE on
 * allocation failure or on any data mismatch.
 */
int main(int argc, char *argv[])
{
    static const int size[] = {1024, 2*1024, 4*1024, 8*1024, 16*1024, 128*1024, 512*1024 ,1024*1024, 8*1024*1024};
    const unsigned count = sizeof(size) / sizeof(size[0]);
    const unsigned runs = 32;
    struct timeval begin, end;
    int use_libc = 0;
    int status = EXIT_SUCCESS;
    unsigned s;

    if (argc > 1)
        use_libc = (atoi(argv[1]) != 0);

    /* Strictly less than count: index `count` is one past the table. */
    for (s = 0; s < count; s++)
    {
        const unsigned len = (unsigned)size[s];
        unsigned char *dest, *src;
        unsigned total = 0;   /* reset per size so the average is not cumulative */
        unsigned i, j;

        printf("size: %dK\n", size[s] / 1024);

        dest = (unsigned char *)_aligned_malloc(len, 0x40);
        if (NULL == dest)
            return EXIT_FAILURE;

        src = (unsigned char *)_aligned_malloc(len, 0x40);
        if (NULL == src)
        {
            /* dest came from _aligned_malloc, so plain free() would be
               handed an address malloc never returned. */
            _aligned_free(dest);
            return EXIT_FAILURE;
        }

        memset(src, 0x44, len);

        for (j = 0; j < runs; j++)
        {
            long tv;

            /* Re-poison the destination outside the timed region so that
               every run's verification actually checks this run's copy. */
            memset(dest, 0x33, len);

            gettimeofday(&begin, NULL);
            if (!use_libc)
                memcpy_neon_pld(dest, src, len);
            else
                memcpy(dest, src, len);
            gettimeofday(&end, NULL);

            /* Report every mismatching byte; the run is still timed. */
            for (i = 0; i < len; i++)
            {
                if (src[i] != dest[i])
                {
                    printf("error %u src %d dest %d!\n", i, src[i], dest[i]);
                    status = EXIT_FAILURE;
                }
            }

            tv = 1000000L * (long)(end.tv_sec - begin.tv_sec)
                 + (long)(end.tv_usec - begin.tv_usec);
            total += (unsigned)tv;
        }

        printf("average : %u us\n", total / runs);

        /* Release this size's buffers before moving on to the next one. */
        _aligned_free(src);
        _aligned_free(dest);
    }

    return status;
}