Modern CPUs have instructions that can operate on several words of
data at once. To allow the compiler to take advantage of such
instructions, the memory copies need to take place from array to
array. This makes the memory copy less CPU bound.
Signed-off-by: Timo Kokkonen <kaapeli@itanic.dy.fi>
int main(int argc, char *argv[])
{
int iterations, j, k, count, size, mask, latcount;
int main(int argc, char *argv[])
{
int iterations, j, k, count, size, mask, latcount;
- unsigned long *buf, i, tmp = 0;
+ unsigned long *buf, i, tmp[8] = {0};
struct timeval start, end;
printf("Benchmark sequential access bandwidth "
struct timeval start, end;
printf("Benchmark sequential access bandwidth "
gettimeofday(&start, 0);
for (k = 0; k < count; k++) {
gettimeofday(&start, 0);
for (k = 0; k < count; k++) {
- buf[0] = tmp;
- for (i = 0; i < size; i++)
- tmp += buf[i];
+ buf[0] = tmp[0];
+ for (i = 0; i < size; i += 8) {
+ tmp[0] += buf[i];
+ tmp[1] += buf[i+1];
+ tmp[2] += buf[i+2];
+ tmp[3] += buf[i+3];
+ tmp[4] += buf[i+4];
+ tmp[5] += buf[i+5];
+ tmp[6] += buf[i+6];
+ tmp[7] += buf[i+7];
+ }
gettimeofday(&start, 0);
for (k = 0; k < latcount; k++) {
gettimeofday(&start, 0);
for (k = 0; k < latcount; k++) {
for (i = 0; i < size; i++)
for (i = 0; i < size; i++)
- tmp += buf[random() & mask];
+ tmp[0] += buf[random() & mask];
gettimeofday(&start, 0);
for (k = 0; k < count; k++)
gettimeofday(&start, 0);
for (k = 0; k < count; k++)
- for (i = 0; i < size; i++)
- buf[i] = i;
+ for (i = 0; i < size; i += 8) {
+ buf[i ] = tmp[0];
+ buf[i+1] = tmp[1];
+ buf[i+2] = tmp[2];
+ buf[i+3] = tmp[3];
+ buf[i+4] = tmp[4];
+ buf[i+5] = tmp[5];
+ buf[i+6] = tmp[6];
+ buf[i+7] = tmp[7];
+ }
gettimeofday(&end, 0);
printf("% 9.2f ", (double)(size * lsize) *
gettimeofday(&end, 0);
printf("% 9.2f ", (double)(size * lsize) *