PSchn - 5 months ago 29

C++ Question

I have a range-image and want to convert it into a libpointmatcher point cloud. The cloud is an

`Eigen::Matrix`

The range-image is an

`unsigned short*`

`unsigned char*`

In serial, my code looks like this:

`//container to hold the data`

std::vector<Eigen::Vector4d> vec;

vec.reserve(this->Height*this->Width);

//contains information about pixel visibility

unsigned char* mask_data = (unsigned char*)range_image.mask.ToPointer();

//contains the actual pixel data

unsigned short* pixel_data = (unsigned short*)range_image.pixel.ToPointer();

for (int y =0;y < range_image.Height; y++)

{

for (int x = 0; x < range_image.Width; x++)

{

int index =x+y*range_image.Width;

if(*(mask_data+index) != 0)

{

vec.push_back(Eigen::Vector4d(x,y,(double)*(data+index),1));

}

}

}

// libpointmatcher point cloud with size of visible pixel

PM::Matrix features(4,vec.size());

PM::DataPoints::Labels featureLabels;

featureLabels.resize(4);

featureLabels[0] = PM::DataPoints::Label::Label("x");

featureLabels[1] = PM::DataPoints::Label::Label("y");

featureLabels[2] = PM::DataPoints::Label::Label("z");

featureLabels[3] = PM::DataPoints::Label::Label("pad");

//fill with data

for(int i = 0; i<vec.size(); i++)

{

features.col(i) = vec[i];

}

Because of the large images this loop takes 500ms for 840000 points and thats too slow. Now my idea was to integrate the code above in one parallized function. The problem is that the

`Eigen::Matrix`

`push_back`

So i need a parallel algorithm to extract visible 3D-Points from my range-image and insert them into the Eigen::Matrix in the right order. I'm working with

As Arch D. Robison suggeested i tried the

`tbb::parallel_scan`

`size_t vec_size = width*height;`

double* out = new double[vec_size * 4];

size_t m1 = Compress(mask, pixel, out, height, width,

[](unsigned char x) {return x != 0; });

Map<MatrixXd> features(out, 4, m1);

. Here is the code from the

`operator()`

`void operator()(const tbb::blocked_range2d<size_t, size_t>& r, Tag) {`

// Use local variables instead of member fields inside the loop,

// to improve odds that values will be kept in registers.

size_t j = sum;

const unsigned char* m = in;

const unsigned short* p = in2;

T* values = out;

size_t yend = r.rows().end();

for (size_t y = r.rows().begin(); y != yend; ++y)

{

size_t xend = r.cols().end();

for (size_t x = r.cols().begin(); x != xend; ++x)

{

size_t index = x + y*width;

if (pred(m[index]))

{

if (Tag::is_final_scan())

{

size_t idx = j*4;

values[idx] = (double)x;

values[idx + 1] = (double)y;

values[idx + 2] = p[index];

values[idx + 3] = 1.0;

}

++j;

}

}

}

sum = j;

}

I'm now 4x faster then the serial version. What do you think about this approach? Did i miss anythink and are there improvements? Thanks

Answer

Here is an example of how to do something like `std::copy_if using`

`tbb::parallel_scan`

. The key method is `operator()`

, which is usually called twice per subrange, once for a prescan and once for a final scan. (But be aware that TBB omits the prescan when it's not necessary.) Here the prescan just does tallying and the final scan does the final work (which includes replaying the tallying). See https://software.intel.com/sites/default/files/bc/2b/parallel_scan.pdf for more details on the methods. Another good references is https://www.cs.cmu.edu/~guyb/papers/Ble93.pdf , which shows lots of things you can do with parallel scan (a.k.a. prefix-sum).

```

```
#include "tbb/parallel_scan.h"
#include "tbb/blocked_range.h"
#include <cstddef>
template<typename T, typename Pred>
class Body {
const T* const in;
T* const out;
Pred pred;
size_t sum;
public:
Body( T* in_, T* out_, Pred pred_) :
in(in_), out(out_), pred(pred_), sum(0)
{}
size_t getSum() const {return sum;}
template<typename Tag>
void operator()( const tbb::blocked_range<size_t>& r, Tag ) {
// Use local variables instead of member fields inside the loop,
// to improve odds that values will be kept in registers.
size_t j = sum;
const T* x = in;
T* y = out;
for( size_t i=r.begin(); i<r.end(); ++i ) {
if( pred(x[i]) ) {
if( Tag::is_final_scan() )
y[j] = x[i];
++j;
}
}
sum = j;
}
// Splitting constructor used for parallel fork.
// Note that it's sum(0), not sum(b.sum), because this
// constructor will be used to compute a partial sum.
// Method reverse_join will put together the two sub-sums.
Body( Body& b, tbb::split ) :
in(b.in), out(b.out), pred(b.pred), sum(0)
{}
// Join partial solutions computed by two Body objects.
// Arguments "this" and "a" correspond to the splitting
// constructor arguments "b" and "this". That's why
// it's called a reverse join.
void reverse_join( Body& a ) {
sum += a.sum;
}
void assign( Body& b ) {sum=b.sum;}
};
// Copy to out each element of in that satisfies pred.
// Return number of elements copied.
template<typename T, typename Pred>
size_t Compress( T* in, T* out, size_t n, Pred pred ) {
Body<T,Pred> b(in,out,pred);
tbb::parallel_scan(tbb::blocked_range<size_t>(0,n), b);
return b.getSum();
}
#include <cmath>
#include <algorithm>
#include <cassert>
int main() {
const size_t n = 10000000;
float* a = new float[n];
float* b = new float[n];
float* c = new float[n];
for( size_t i=0; i<n; ++i )
a[i] = std::cos(float(i));
size_t m1 = Compress(a, b, n, [](float x) {return x<0;});
size_t m2 = std::copy_if(a, a+n, c, [](float x) {return x<0;})-c;
assert(m1==m2);
for( size_t i=0; i<n; ++i )
assert(b[i]==c[i]);
}
```

```