## Wednesday, 10 March 2010

### F# vs Unmanaged C++ for parallel numerics

We obtained a surprising performance result when comparing optimized parallel ray tracers written in F# and C++ recently. The following two programs render the same highly complex scenes containing over a million objects. Surprisingly, the 136-line managed F# program runs slightly faster at 17s than the 168-line unmanaged C++ which takes 18s.

Here is the F# program:

```let t = System.Diagnostics.Stopwatch.StartNew()

let delta = sqrt epsilon_float

[<Struct>]
type vec =
val x : float
val y : float
val z : float

new(x, y, z) = {x=x; y=y; z=z}

static member ( + ) (a : vec, b : vec) = new vec(a.x+b.x, a.y+b.y, a.z+b.z)
static member ( - ) (a : vec, b : vec) = new vec(a.x-b.x, a.y-b.y, a.z-b.z)
static member ( * ) (a, b : vec) = new vec(a*b.x, a*b.y, a*b.z)

let vec x y z = new vec(x, y, z)

let dot (a : vec) (b : vec) = a.x * b.x + a.y * b.y + a.z * b.z
let length r = sqrt(dot r r)
let unitise r = 1. / length r * r

type scene = Scene of vec * float * scene_type
and scene_type =
| Sphere
| Group of scene * scene * scene * scene * scene

let infinity = System.Double.PositiveInfinity

let inline ray_sphere (d : vec) (v : vec) r =
let vx = v.x
let vy = v.y
let vz = v.z
let disc = vx * vx + vy * vy + vz * vz - r * r
if disc < 0. then infinity else
let b = vx * d.x + vy * d.y + vz * d.z
let b2 = b * b
if b2 < disc then infinity else
let disc = sqrt(b2 - disc)
let t1 = b - disc in
if t1 > 0. then t1 else b + disc

let ray_sphere' (o : vec) (d : vec) (c : vec) r =
let vx = c.x - o.x
let vy = c.y - o.y
let vz = c.z - o.z
let vv = vx * vx + vy * vy + vz * vz
let b = vx * d.x + vy * d.y + vz * d.z
let disc = b * b - vv + r * r
disc >= 0. && b + sqrt disc >= 0.

type hit = {mutable l: float; mutable nx: float; mutable ny: float; mutable nz: float}

let intersect_sphere (dir: vec) hit l' (center: vec) =
let x = l' * dir.x - center.x
let y = l' * dir.y - center.y
let z = l' * dir.z - center.z
let il = 1. / sqrt(x * x + y * y + z * z)
hit.l <- l'
hit.nx <- il * x
hit.ny <- il * y
hit.nz <- il * z

let rec intersect dir hit = function
let l' = ray_sphere dir center radius
if l' < hit.l then
match typ with
| Sphere ->
intersect_sphere dir hit l' center
| Group(a, b, c, d, e) ->
intersect dir hit a
intersect dir hit b
intersect dir hit c
intersect dir hit d
intersect dir hit e

let rec intersect' orig dir = function
ray_sphere' orig dir center radius &&
match typ with
| Sphere -> true
| Group (a, b, c, d, e) ->
intersect' orig dir a ||
intersect' orig dir b ||
intersect' orig dir c ||
intersect' orig dir d ||
intersect' orig dir e

let neg_light = unitise(vec 1. 3. -2.)

let rec ray_trace dir scene =
let hit = {l=infinity; nx=0.; ny=0.; nz=0.}
intersect dir hit scene;
if hit.l = infinity then 0. else
let n = vec hit.nx hit.ny hit.nz in
let g = dot n neg_light in
if g < 0. then 0. else
if intersect' (hit.l * dir + delta * n) neg_light scene then 0. else g

let fold5 f x a b c d e = f (f (f (f (f x a) b) c) d) e

let rec create level c r =
let obj = Scene(c, r,Sphere)
if level = 1 then obj else
let a = 3. * r / sqrt 12.
let rec bound (c, r) = function
| Scene(c', r',Sphere) -> c, max r (length (c - c') + r')
| Scene(_, _, Group(v, w, x, y, z)) -> fold5 bound (c, r) v w x y z
let aux x' z' = create (level - 1) (c + vec x' a z') (0.5 * r)
let w, x, y, z = aux a (-a), aux (-a) (-a), aux a a, aux (-a) a
let c, r = fold5 bound (c + vec 0. r 0., 0.) obj w x y z in
Scene(c, r, Group(obj, w, x, y, z))

let level, n = 11, 2048

let scene = create level (vec 0. -1. 4.) 1.
let ss = 4

do
use ch = System.IO.File.Create("C:\Users\Jon\Documents\Visual Studio 10\Projects\F#\RayTracer\image.pgm", n * n + 32)
sprintf "P5\n%d %d\n255\n" n n
|> String.iter (ch.WriteByte << byte)
let image = Array.create (n*n) 0uy
for x = 0 to n - 1 do
let mutable g = 0.
for dx = 0 to ss - 1 do
for dy = 0 to ss - 1 do
let aux x d = float x - 0.5 * float n + float d / float ss
let dir = unitise(vec (aux x dx) (aux y dy) (float n))
g <- g + ray_trace dir scene
image.[x+y*n] <- 0.5 + 255. * g / float (ss*ss) |> byte)
|> ignore
ch.Write(image, 0, n*n)
printf "Took %gs" t.Elapsed.TotalSeconds```

And here is the C++ program:

```#include "stdafx.h"
#include <vector>
#include <iostream>
#include <fstream>
#include <limits>
#include <cmath>

using namespace std;

numeric_limits<double> real;

double delta = sqrt(real.epsilon()), infinity = real.infinity();

struct Vec {
double x, y, z;
Vec(double x2, double y2, double z2) : x(x2), y(y2), z(z2) {}
};

Vec operator+(const Vec &a, const Vec &b)
{ return Vec(a.x+b.x, a.y+b.y, a.z+b.z); }

Vec operator-(const Vec &a, const Vec &b)
{ return Vec(a.x-b.x, a.y-b.y, a.z-b.z); }

Vec operator*(double a, const Vec &b) { return Vec(a*b.x, a*b.y, a*b.z); }

double dot(const Vec &a, const Vec &b) { return a.x*b.x + a.y*b.y + a.z*b.z; }

double length(const Vec &a) { return sqrt(dot(a, a)); }

Vec unitise(const Vec &a) { return (1 / sqrt(dot(a, a))) * a; }

struct Hit {
double lambda;
Vec normal;
Hit() : lambda(infinity), normal(Vec(0, 0, 0)) {}
};

struct Sphere;

struct Scene {
virtual ~Scene() {};
virtual void intersect(Hit &, const Vec &) const = 0;
virtual bool intersect(const Vec &, const Vec &) const = 0;
virtual Sphere bound(Sphere b) const = 0;
};

struct Sphere : public Scene {
Vec center;
Sphere(Vec c, double r) : center(c), radius(r) {}
~Sphere() {}
double ray_sphere(const Vec &dir) const {
double b = center.x*dir.x + center.y*dir.y + center.z*dir.z;
if (disc > 0) {
double d = sqrt(disc), t2 = b + d;
if (t2 > 0) {
double t1 = b - d;
return (t1 > 0 ? t1 : t2);
}
}
return infinity;
}
bool sray_sphere(const Vec &orig, const Vec &dir) const {
Vec v = center - orig;
double b = dot(v, dir), disc = b*b - dot(v, v) + radius * radius;
return (disc < 0 ? false : b + sqrt(disc) >= 0);
}
void intersect(Hit &hit, const Vec &dir) const {
double lambda = ray_sphere(dir);
if (lambda < hit.lambda) {
hit.lambda = lambda;
double
nx = lambda*dir.x - center.x,
ny = lambda*dir.y - center.y,
nz = lambda*dir.z - center.z;
double il = 1/sqrt(nx*nx + ny*ny + nz*nz);
hit.normal.x = il*nx;
hit.normal.y = il*ny;
hit.normal.z = il*nz;
}
}
bool intersect(const Vec &orig, const Vec &dir) const
{ return sray_sphere(orig, dir); }
Sphere bound(Sphere b) const {
}
};

typedef vector<Scene *> Scenes;

struct Group : public Scene {
Sphere b;
Scenes child;
Group(Sphere b, Scenes c) : b(b), child(c) {}
~Group() {
for (Scenes::const_iterator it=child.begin(); it!=child.end(); ++it)
delete *it;
}
void intersect(Hit &hit, const Vec &dir) const {
if (b.ray_sphere(dir) < hit.lambda)
for (Scenes::const_iterator it=child.begin(); it!=child.end(); ++it)
(*it)->intersect(hit, dir);
}
bool intersect(const Vec &orig, const Vec &dir) const {
if (!b.sray_sphere(orig, dir)) return false;
for (Scenes::const_iterator it=child.begin(); it!=child.end(); ++it)
if ((*it)->intersect(orig, dir)) return true;
return false;
}
Sphere bound(Sphere b) const {
Sphere b2 = b;
for (Scenes::const_iterator it=child.begin(); it!=child.end(); ++it)
b2 = (*it)->bound(b2);
return b2;
}
};

double ray_trace(const Vec &neg_light, const Vec &dir, const Scene &s) {
Hit hit;
s.intersect(hit, dir);
if (hit.lambda == infinity) return 0;
double g = dot(hit.normal, neg_light);
if (g < 0) return 0.;
Vec p = hit.lambda*dir + delta*hit.normal;
return (s.intersect(p, neg_light) ? 0 : g);
}

Scene *create(int level, Vec c, double r) {
Scene *s = new Sphere(c, r);
if (level == 1) return s;
Scenes child;
child.reserve(5);
child.push_back(s);
double rn = 3*r/sqrt(12.);
for (int dz=-1; dz<=1; dz+=2)
for (int dx=-1; dx<=1; dx+=2)
child.push_back(create(level-1, c + rn*Vec(dx, 1, dz), r/2));
Sphere b2(c + Vec(0, r, 0), r);
for (Scenes::const_iterator it=child.begin(); it!=child.end(); ++it)
b2 = (*it)->bound(b2);
return new Group(b2, child);
}

int _tmain(int argc, char* argv[]) {
int level = (argc==3 ? atoi(argv) : 11),
n = (argc==3 ? atoi(argv) : 2048), ss = 4;
Vec neg_light = unitise(Vec(1, 3, -2));
Scene *s(create(level, Vec(0, -1, 4), 1));
ofstream out;
std::vector<char> image(n*n);
#pragma omp parallel for
for (int y=0; y<n; ++y)
for (int x=0; x<n; ++x) {
double g=0;
for (int dx=0; dx<ss; ++dx)
for (int dy=0; dy<ss; ++dy) {
Vec dir(unitise(Vec(x+dx*1./ss-n/2., y+dy*1./ss-n/2., n)));
g += ray_trace(neg_light, dir, *s);
}
image[x+(n-1-y)*n] = .5 + 255. * g / (ss*ss);
}
out.open("C:\\Users\\Jon\\Documents\\image.pgm", std::ios_base::binary);
out << "P5\n" << n << " " << n << "\n255\n";
out.write(&image, n*n);
return 0;
}``` Is it possible to publish also the image.png? Anonymous said...

With TBB and SIMD support, C++ would absolutely cream F# and you know it :)

Tom Leys said...

“when comparing optimized parallel ray tracers written in F# and C++ recently.”

I doubt that the C++ implementation is optimised. For a start, an optimised C++ ray-tracer would probably be data oriented not object oriented.

Also, it would use technologies such as SIMD that are not available to the F# implementation.

Finally it wouldn’t put virtual function calls inside the tightest loop possible

struct Scene {

virtual ~Scene() {};

// Called multiple times per pixel:
virtual void intersect(Hit &, const Vec &) const = 0;

virtual bool intersect(const Vec &, const Vec &) const = 0;

virtual Sphere bound(Sphere b) const = 0;

};

Called from

for (int y=0; y Sorry, the comment form nuked my indenting. Pre tag wasn't allowed.

So yeah, F# beats unoptimised C++ - Gee wizz, what a surprise.

Tom Leys said...

The second half of my comment got truncated somehow.

Called from

for (int y=0; y<n; ++y)
for (int x=0; x<n; ++x) {
double g=0;
for (int dx=0; dx<ss; ++dx)
for (int dy=0; dy<ss; ++dy) {
Vec dir(unitise(Vec(x+dx*1./ss-n/2., y+dy*1./ss-n/2., n)));
//Called here (ray_trace calls Scene::intersect):
g += ray_trace(neg_light, dir, *s);
}

-Tom

Michael Robin said...

>So yeah, F# beats unoptimised C++ - Gee wizz, what a surprise.

Well, that *is* amazing, in the same way that drinking a glass of wine at 40k feet is amazing. Just because you're jaded doesn't make it less so. Only a few years ago you couldn't convince someone that any lang. w/F#'s features could even approach C++ (optimized or not) by an order of magnitude. The conclusion is clear, that there's very little reason to code in C++ which requires many more programmer cycles to get right (it's easier to make a clusterf*ck of a C++ program) than F#, where they're so close in run-time cycles.

Jon Harrop said...

@Tom Leys: The C++ version is from here where it has been read by hundreds of thousands of people and improved by dozens of people over the course of several years. If you can optimize it further, please do.

While we are making unsubstantiated speculations: my guess is that using Intel's C++ compiler would provide a much larger performance improvement than trying to use SIMD (when possible) or removing those virtual function calls (which the compiler has almost certainly already done anyway). Anonymous said...

Changing

static member ( + ) (a : vec, b : vec) = new vec(a.x+b.x, a.y+b.y, a.z+b.z)
static member ( - ) (a : vec, b : vec) = new vec(a.x-b.x, a.y-b.y, a.z-b.z)
static member ( * ) (a, b : vec) = new vec(a*b.x, a*b.y, a*b.z)

to

let (+.) (a : vec) (b : vec) = new vec(a.x-b.x, a.y-b.y, a.z-b.z)
let (-.) (a : vec) (b : vec) = new vec(a.x-b.x, a.y-b.y, a.z-b.z)

and adjusting the call-sites accordingly speeds up the F# by about a factor of about 2.5 on my box. Structs+JIT+inlining maybe?

Dmitri said...

I'm sorry but if you read overviews people do of C# vs C++ (yep, that's C#, but it shouldn't matter), you'll see that C# does, in fact, lag behind by an order of magnitude.

For enterprise software with a financial/math slant, F# is just fine. But for pedal-to-the-metal performance... you've got to be kidding.

Jon Harrop said...

@Dmitri: Then how do you explain this result?

Michael Robin said...

>C# does, in fact, lag behind by an order of magnitude

Can you point at some (newish) benchmarks? I didn't even think C# 1.x on Mono was quite this bad, on avg. Current C# tooling ends up somewhere between 0.9x and 2.0x of C++. For non-trivial programs written by average programmers, C# tends to beat out C++ by a large margin, do to un-tuned memory mgmnt (copy c'tors being called unbeknownst to the coder, etc.)

Art said...

error FS0039: The value or constructor 'epsilon_float' is not defined

Jon Harrop said...

@Art: epsilon_float is now in the F# PowerPack Compatibility DLL. Anonymous said...

Completely I share your opinion. In it something is also idea excellent, I support. Anonymous said...

Very Interesting!
Thank You! Anonymous said...

In reality C++ stl containers and iterators are not the fastest solution (especially what comes with Visual Studio). They are quite generic and have a lot of redundancy (including thread-safety locking in places where you do not expect them). Just go through the code in the debugger and you will be really amazed by what you will discover. So using proper designed custom containers will be much more efficient (though it will take much time to write the code). Anonymous said... Anonymous said...

Thank you for posting this, it was quite helpful and told a lot Anonymous said...

А! Vielen Dank für die hilfreichen Beitrag! Ich würde nicht anders bekommen haben! Anonymous said...

I’ve meant to post about something like this on my webpage and you gave me an idea. Cheers.

using vs2010, i tried to replecate the result but using ms's parrallel_for instead of openmp's directive and i come with different result (namely the c++ version beeing fastest). Did someone else try that ?

Arash Salarian said...

As Keita Abdoul-kader mentioned, simply replacing OpenMP with PPL in visual C++ 2010 makes a dramatic change in the C++ performance. On my machine, the C++ version with OpenMP takes 32.5 seconds to run. With PPL the runtime goes down to 7.3 seconds! Oh, btw, the runtime for the F# version was 13.4 seconds.

For those who want to repeat the experiment, in the main change the outer for loop from:

#pragma omp parallel for
for (int y=0; y<n; ++y) {
...
}

to

Concurrency::parallel_for(0, n, [&](int y){
...
});

Arash Salarian said...
This comment has been removed by the author.
Arash Salarian said...

Ooops! Turns out that OpenMP was not turned on in my project. Turning it on (Properties/C/C++/Language/Open MP Support), the OpenMP C++ version runs in 9.95 seconds; still slower than the PPL version but faster than the F# one. I didn't touch any other part of the F# or C++ programs.