Skip to content

Commit

Permalink
Reintroduce SSE fallback, for my poor Atom x5-Z8350. All hail the Ato…
Browse files Browse the repository at this point in the history
…mic PI!
  • Loading branch information
ttsiodras committed Jul 14, 2022
1 parent 48732b8 commit 497a59a
Show file tree
Hide file tree
Showing 6 changed files with 229 additions and 19 deletions.
17 changes: 10 additions & 7 deletions README
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ COMPILE/INSTALL/RUN
Windows
-------
Windows users can download and run a pre-compiled Windows binary
[here](https://github.com/ttsiodras/MandelbrotSSE/releases/download/2.9/mandelSSE-win32-2.9.zip).
[here](https://github.com/ttsiodras/MandelbrotSSE/releases/download/2.10/mandelSSE-win32-2.10.zip).

After decompressing, you can simply execute either one of the two .bat
files. The 'autopilot' one zooms in a specific location, while the other
Expand All @@ -32,12 +32,15 @@ You can then simply...
$ src/mandelSSE -h
Usage: ./src/mandelSSE [-a|-m] [-h] [-b] [-v|-s|-d] [-f rate] [WIDTH HEIGHT]
Where:
-h Show this help message
-m Run in mouse-driven mode
-a Run in autopilot mode (default)
-b Run in benchmark mode (implies autopilot)
-f fps Enforce upper bound of frames per second (default: 60)
(use 0 to run at full possible speed)
-h Show this help message
-m Run in mouse-driven mode
-a Run in autopilot mode (default)
-b Run in benchmark mode (implies autopilot)
-v Force use of AVX
-s Force use of SSE
-d Force use of non-AVX, non-SSE code
-f fps Enforce upper bound of frames per second (default: 60)
(use 0 to run at full possible speed)

If WIDTH and HEIGHT are not provided, they default to: 1024 768

Expand Down
17 changes: 10 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ COMPILE/INSTALL/RUN
Windows
-------
Windows users can download and run a pre-compiled Windows binary
[here](https://github.com/ttsiodras/MandelbrotSSE/releases/download/2.9/mandelSSE-win32-2.9.zip).
[here](https://github.com/ttsiodras/MandelbrotSSE/releases/download/2.10/mandelSSE-win32-2.10.zip).

After decompressing, you can simply execute either one of the two .bat
files. The 'autopilot' one zooms in a specific location, while the other
Expand All @@ -32,12 +32,15 @@ You can then simply...
$ src/mandelSSE -h
Usage: ./src/mandelSSE [-a|-m] [-h] [-b] [-v|-s|-d] [-f rate] [WIDTH HEIGHT]
Where:
-h Show this help message
-m Run in mouse-driven mode
-a Run in autopilot mode (default)
-b Run in benchmark mode (implies autopilot)
-f fps Enforce upper bound of frames per second (default: 60)
(use 0 to run at full possible speed)
-h Show this help message
-m Run in mouse-driven mode
-a Run in autopilot mode (default)
-b Run in benchmark mode (implies autopilot)
-v Force use of AVX
-s Force use of SSE
-d Force use of non-AVX, non-SSE code
-f fps Enforce upper bound of frames per second (default: 60)
(use 0 to run at full possible speed)

If WIDTH and HEIGHT are not provided, they default to: 1024 768

Expand Down
34 changes: 30 additions & 4 deletions src/mandel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,15 @@

void usage(char *argv[])
{
printf("Usage: %s [-a|-m] [-h] [-b] [-f rate] [WIDTH HEIGHT]\n", argv[0]);
printf("Usage: %s [-a|-m] [-h] [-b] [-v|-s|-d] [-f rate] [WIDTH HEIGHT]\n", argv[0]);
puts("Where:");
puts("\t-h\tShow this help message");
puts("\t-m\tRun in mouse-driven mode");
puts("\t-a\tRun in autopilot mode (default)");
puts("\t-b\tRun in benchmark mode (implies autopilot)");
puts("\t-v\tForce use of AVX");
puts("\t-s\tForce use of SSE");
puts("\t-d\tForce use of non-AVX, non-SSE code");
puts("\t-f fps\tEnforce upper bound of frames per second (default: 60)");
puts("\t \t(use 0 to run at full possible speed)\n");
puts("If WIDTH and HEIGHT are not provided, they default to: 1024 768");
Expand All @@ -42,8 +45,9 @@ int main(int argc, char *argv[])
{
int opt, fps = 60;
bool autoPilot = true, benchmark = false;
bool forceAVX = false, forceSSE = false, forceDefault = false;

while ((opt = getopt(argc, argv, "hmabf:")) != -1) {
while ((opt = getopt(argc, argv, "hmabvsdf:")) != -1) {
switch (opt) {
case 'h':
usage(argv);
Expand All @@ -58,6 +62,15 @@ int main(int argc, char *argv[])
autoPilot = true;
benchmark = true;
break;
case 'v':
forceAVX = true;
break;
case 's':
forceSSE = true;
break;
case 'd':
forceDefault = true;
break;
case 'f':
if (1 != sscanf(optarg, "%d", &fps))
panic("[x] Not a valid frame rate: '%s'", optarg);
Expand Down Expand Up @@ -112,8 +125,21 @@ int main(int argc, char *argv[])
else
printf("[-] FPS Limit: %d frames/sec\n", fps);
#ifdef __x86_64__
CoreLoopDouble = __builtin_cpu_supports("avx") ? CoreLoopDoubleAVX : CoreLoopDoubleDefault;
printf("[-] Mode: %s\n", __builtin_cpu_supports("avx") ? "AVX" : "non-AVX");
if (forceAVX)
CoreLoopDouble = CoreLoopDoubleAVX;
else if (forceSSE)
CoreLoopDouble = CoreLoopDoubleSSE;
else if (forceDefault)
CoreLoopDouble = CoreLoopDoubleDefault;
else
CoreLoopDouble =
__builtin_cpu_supports("avx") ? CoreLoopDoubleAVX
: __builtin_cpu_supports("sse") ? CoreLoopDoubleSSE
: CoreLoopDoubleDefault;
printf("[-] Mode: %s\n",
CoreLoopDouble == CoreLoopDoubleAVX ? "AVX"
: CoreLoopDouble == CoreLoopDoubleSSE ? "SSE"
: "non-AVX/non-SSE");
#else
CoreLoopDouble = CoreLoopDoubleDefault;
printf("[-] Mode: %s\n", "non-AVX");
Expand Down
177 changes: 177 additions & 0 deletions src/sse.cc
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,183 @@ void CoreLoopDoubleDefault(double xcur, double ycur, double xstep, unsigned char

#ifdef __x86_64__

void CoreLoopDoubleSSE(double xcur, double ycur, double xstep, unsigned char **p)
{
DECLARE_ALIGNED(16,double,re[2]);
DECLARE_ALIGNED(16,double,im[2]);
DECLARE_ALIGNED(16,unsigned,k1[2]);

DECLARE_ALIGNED(16,double,outputs[2]);

re[0] = xcur;
re[1] = (xcur + xstep);

im[0] = im[1] = ycur;

k1[0] = k1[1] = 0;
// x' = x^2 - y^2 + a
// y' = 2xy + b
//
asm("mov %6,%%ecx\n\t" // ecx is ITERA
"xor %%ebx, %%ebx\n\t" // period = 0
"movapd %3,%%xmm5\n\t" // 4. 4. ; xmm5
"movapd %1,%%xmm6\n\t" // a0 a1 ; xmm6
"movaps %2,%%xmm7\n\t" // b0 b1 ; xmm7
"xorpd %%xmm0,%%xmm0\n\t" // 0. 0. ; rez in xmm0
"xorpd %%xmm1,%%xmm1\n\t" // 0. 0. ; imz in xmm1
"xorpd %%xmm3,%%xmm3\n\t" // 0. 0. ; bailout counters
"xorpd %%xmm8,%%xmm8\n\t" // 0. 0. ; bailout counters
"xorpd %%xmm9,%%xmm9\n\t" // 0. 0. ; bailout counters

"1:\n\t" // Main Mandelbrot computation
"movapd %%xmm0,%%xmm2\n\t" // x0 x1 ; xmm2
"mulpd %%xmm1,%%xmm2\n\t" // x0*y0 x1*y1 ; xmm2
"mulpd %%xmm0,%%xmm0\n\t" // x0^2 x1^2 ; xmm0
"mulpd %%xmm1,%%xmm1\n\t" // y0^2 y1^2 ; xmm1
"movapd %%xmm0,%%xmm4\n\t" //
"addpd %%xmm1,%%xmm4\n\t" // x0^2+y0^2 x1... ; xmm4
"subpd %%xmm1,%%xmm0\n\t" // x0^2-y0^2 x1... ; xmm0
"addpd %%xmm6,%%xmm0\n\t" // x0' x1' ; xmm0
"movapd %%xmm2,%%xmm1\n\t" // x0*y0 x1*y1 ; xmm1
"addpd %%xmm1,%%xmm1\n\t" // 2x0*y0 2x1*y1 ; xmm1
"addpd %%xmm7,%%xmm1\n\t" // y0' y1' ; xmm1

"cmpltpd %%xmm5,%%xmm4\n\t" // <4 <4 ; xmm2
"movapd %%xmm4,%%xmm2\n\t" // xmm2 has all 1s in the non-overflowed pixels
"movmskpd %%xmm4,%%eax\n\t" // (lower 2 bits reflect comparisons)
"andpd %4,%%xmm4\n\t" // so, prepare to increase the non-overflowed (and with ones)
"addpd %%xmm4,%%xmm3\n\t" // by updating their counters

"or %%eax,%%eax\n\t" // have both pixels overflowed ?

"je 2f\n\t" // yes, jump forward to label 2 (hence, 2f) and end the loop
"dec %%ecx\n\t" // otherwise, repeat the loop ITERA times...
"jnz 22f\n\t" // but before redoing the loop, first do periodicity checking

// We've done the loop ITERA times.
// Set non-overflowed outputs to 0 (inside xmm3). Here's how:
"movapd %%xmm2,%%xmm4\n\t" // xmm4 has all 1s in the non-overflowed pixels...
"xorpd %5,%%xmm4\n\t" // xmm4 has all 1s in the overflowed pixels (toggled, via xoring with allbits)
"andpd %%xmm4,%%xmm3\n\t" // zero out the xmm3 parts that belong to non-overflowed (set to black)
"jmp 2f\n\t" // And jump to end of everything, where xmm3 is written into outputs

"22:\n\t" // Periodicity checking
"inc %%bl\n\t" // period++
"and $0xF, %%bl\n\t" // period &= 0xF
"jnz 11f\n\t" // if period is not zero, continue to check if we're seeing xold, yold again
"movapd %%xmm0, %%xmm8\n\t" // time to update xold[2], yold[2] - store xold[2] in xmm8
"movapd %%xmm1, %%xmm9\n\t" // and yold[2] in xmm9
"jmp 1b\n\t" // and jump back to the loop beginning

"11:\n\t" // are we seeing xold[2], yold[2] into our rez[2], imz[2]?
"movapd %%xmm8, %%xmm10\n\t" // the comparison instruction will modify the target XMM register, so use xmm10
"cmpeqpd %%xmm0, %%xmm10\n\t" // compare xmm10 (which now has xold[2]) with rez[2]. Set all 1s into xmm10 if equal
"movmskpd %%xmm10,%%eax\n\t" // the lower 2 bits of EAX now reflect the result of the comparison.
"or %%eax, %%eax\n\t" // are they BOTH zero?
"jz 1b\n\t" // Yes - so, neither of the two rez matched with the two xold. Repeat the loop
"movapd %%xmm9, %%xmm10\n\t" // Set xmm10 to contain yold[2]
"cmpeqpd %%xmm1, %%xmm10\n\t" // compare xmm10 with imz[2]. Set all 1s into xmm10 if equal
"movmskpd %%xmm10,%%eax\n\t" // the lower 2 bits of EAX now reflect the result of the comparison.
"or %%eax, %%eax\n\t" // are they BOTH zero?
"jz 1b\n\t" // Yes - so, neither of the two imz matched with the two yold. Repeat the loop
"xorpd %%xmm3,%%xmm3\n\t" // Repetition detected. Set both results to 0.0 (both pixels black)

"2:\n\t"
"movapd %%xmm3,%0\n\t"
:"=m"(outputs[0])
:"m"(re[0]),"m"(im[0]),"m"(fours[0]),"m"(ones[0]),"m"(allbits[0]),"i"(ITERA)
:"%eax","%ebx","%ecx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","memory");

int tmp = (int)(outputs[0]);
*(*p)++ = tmp;
tmp = (int)(outputs[1]);
*(*p)++ = tmp;

re[0] = xcur + 2*xstep;
re[1] = xcur + 3*xstep;

im[0] = im[1] = ycur;

k1[0] = k1[1] = 0;
// x' = x^2 - y^2 + a
// y' = 2xy + b
//
asm("mov %6,%%ecx\n\t" // ecx is ITERA
"xor %%ebx, %%ebx\n\t" // period = 0
"movapd %3,%%xmm5\n\t" // 4. 4. ; xmm5
"movapd %1,%%xmm6\n\t" // a0 a1 ; xmm6
"movaps %2,%%xmm7\n\t" // b0 b1 ; xmm7
"xorpd %%xmm0,%%xmm0\n\t" // 0. 0. ; rez in xmm0
"xorpd %%xmm1,%%xmm1\n\t" // 0. 0. ; imz in xmm1
"xorpd %%xmm3,%%xmm3\n\t" // 0. 0. ; bailout counters
"xorpd %%xmm8,%%xmm8\n\t" // 0. 0. ; bailout counters
"xorpd %%xmm9,%%xmm9\n\t" // 0. 0. ; bailout counters

"1:\n\t" // Main Mandelbrot computation
"movapd %%xmm0,%%xmm2\n\t" // x0 x1 ; xmm2
"mulpd %%xmm1,%%xmm2\n\t" // x0*y0 x1*y1 ; xmm2
"mulpd %%xmm0,%%xmm0\n\t" // x0^2 x1^2 ; xmm0
"mulpd %%xmm1,%%xmm1\n\t" // y0^2 y1^2 ; xmm1
"movapd %%xmm0,%%xmm4\n\t" //
"addpd %%xmm1,%%xmm4\n\t" // x0^2+y0^2 x1... ; xmm4
"subpd %%xmm1,%%xmm0\n\t" // x0^2-y0^2 x1... ; xmm0
"addpd %%xmm6,%%xmm0\n\t" // x0' x1' ; xmm0
"movapd %%xmm2,%%xmm1\n\t" // x0*y0 x1*y1 ; xmm1
"addpd %%xmm1,%%xmm1\n\t" // 2x0*y0 2x1*y1 ; xmm1
"addpd %%xmm7,%%xmm1\n\t" // y0' y1' ; xmm1

"cmpltpd %%xmm5,%%xmm4\n\t" // <4 <4 ; xmm2
"movapd %%xmm4,%%xmm2\n\t" // xmm2 has all 1s in the non-overflowed pixels
"movmskpd %%xmm4,%%eax\n\t" // (lower 2 bits reflect comparisons)
"andpd %4,%%xmm4\n\t" // so, prepare to increase the non-overflowed (and with ones)
"addpd %%xmm4,%%xmm3\n\t" // by updating their counters

"or %%eax,%%eax\n\t" // have both pixels overflowed ?

"je 2f\n\t" // yes, jump forward to label 2 (hence, 2f) and end the loop
"dec %%ecx\n\t" // otherwise, repeat the loop ITERA times...
"jnz 22f\n\t" // but before redoing the loop, first do periodicity checking

// We've done the loop ITERA times.
// Set non-overflowed outputs to 0 (inside xmm3). Here's how:
"movapd %%xmm2,%%xmm4\n\t" // xmm4 has all 1s in the non-overflowed pixels...
"xorpd %5,%%xmm4\n\t" // xmm4 has all 1s in the overflowed pixels (toggled, via xoring with allbits)
"andpd %%xmm4,%%xmm3\n\t" // zero out the xmm3 parts that belong to non-overflowed (set to black)
"jmp 2f\n\t" // And jump to end of everything, where xmm3 is written into outputs

"22:\n\t" // Periodicity checking
"inc %%bl\n\t" // period++
"and $0xF, %%bl\n\t" // period &= 0xF
"jnz 11f\n\t" // if period is not zero, continue to check if we're seeing xold, yold again
"movapd %%xmm0, %%xmm8\n\t" // time to update xold[2], yold[2] - store xold[2] in xmm8
"movapd %%xmm1, %%xmm9\n\t" // and yold[2] in xmm9
"jmp 1b\n\t" // and jump back to the loop beginning

"11:\n\t" // are we seeing xold[2], yold[2] into our rez[2], imz[2]?
"movapd %%xmm8, %%xmm10\n\t" // the comparison instruction will modify the target XMM register, so use xmm10
"cmpeqpd %%xmm0, %%xmm10\n\t" // compare xmm10 (which now has xold[2]) with rez[2]. Set all 1s into xmm10 if equal
"movmskpd %%xmm10,%%eax\n\t" // the lower 2 bits of EAX now reflect the result of the comparison.
"or %%eax, %%eax\n\t" // are they BOTH zero?
"jz 1b\n\t" // Yes - so, neither of the two rez matched with the two xold. Repeat the loop
"movapd %%xmm9, %%xmm10\n\t" // Set xmm10 to contain yold[2]
"cmpeqpd %%xmm1, %%xmm10\n\t" // compare xmm10 with imz[2]. Set all 1s into xmm10 if equal
"movmskpd %%xmm10,%%eax\n\t" // the lower 2 bits of EAX now reflect the result of the comparison.
"or %%eax, %%eax\n\t" // are they BOTH zero?
"jz 1b\n\t" // Yes - so, neither of the two imz matched with the two yold. Repeat the loop
"xorpd %%xmm3,%%xmm3\n\t" // Repetition detected. Set both results to 0.0 (both pixels black)

"2:\n\t"
"movapd %%xmm3,%0\n\t"
:"=m"(outputs[0])
:"m"(re[0]),"m"(im[0]),"m"(fours[0]),"m"(ones[0]),"m"(allbits[0]),"i"(ITERA)
:"%eax","%ebx","%ecx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","memory");

tmp = (int)(outputs[0]);
*(*p)++ = tmp;
tmp = (int)(outputs[1]);
*(*p)++ = tmp;
}

void CoreLoopDoubleAVX(double xcur, double ycur, double xstep, unsigned char **p)
{
DECLARE_ALIGNED(32,double,re[4]);
Expand Down
1 change: 1 addition & 0 deletions src/sse.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#define __MANDELSSE_H__

// Alternative implementations of the Mandelbrot core loop. All three share
// one signature so that a single function pointer (CoreLoopDouble, assigned
// in main) can refer to whichever variant is selected at startup.
//
//   xcur, ycur  coordinates of the first point to compute
//   xstep       real-axis step between consecutive points
//   p           in/out cursor into the output byte buffer
//
// CoreLoopDoubleSSE writes four bytes through *p and advances it by four per
// call. NOTE(review): the Default and AVX variants presumably follow the same
// convention - confirm against their definitions in sse.cc.
void CoreLoopDoubleDefault(double xcur, double ycur, double xstep, unsigned char **p);
void CoreLoopDoubleSSE(double xcur, double ycur, double xstep, unsigned char **p);
void CoreLoopDoubleAVX(double xcur, double ycur, double xstep, unsigned char **p);

#endif
2 changes: 1 addition & 1 deletion src/xaos.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ int compare_points(const void *p1, const void *p2)
}

#if defined(__x86_64__) && !defined(__WIN64__)
#define AUTO_DISPATCH __attribute__((target_clones("default","avx")))
#define AUTO_DISPATCH __attribute__((target_clones("default","sse","avx")))
#else
#define AUTO_DISPATCH
#endif
Expand Down

0 comments on commit 497a59a

Please sign in to comment.