/*
* multi.cpp
*
* Created on: 13 Mar 2014
* Author: Russell John Childs.
*/
//=======================================================================================================
// COPYRIGHT NOTICE
// This code sample and ideas embodied remain the property of Russell John Childs, PhD and have been
// distributed as a representative example of my use of C++11 features.
//==========================================================================================================
//====================================================
// File contains
// (1) Implementation of lock-based thread pool.
// (2) Implementation of lock-free "thread pool".
// (3) Implementation of SIMD, autovectorisation (N.B. This was compiled for an Intel i7-3720QM Ivy Bridge processor)
// Implementation of parallel search (using lock-based thread pool, lock-free event-based pool, SIMD parallel for):
// 1) Split sorted array into <num_threads> equal chunks
// 2) Assign each chunk to a thread.
// 3) Thread returns true iff chunk.begin() <= search_val <= chunk.end()
// 4) Replace array with chunk that returned true and return to step 1
// Complexity:
// t - number of threads ( > 1 )
// Binary = O(log_2(n/t)) = O(log_2(n)-log_2(t))
// Parallel = O(log_t(n))
//
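// Worked example (added for illustration, using the sizes set later in this file): for n = 2^25 elements
// and t = 8 threads/lanes,
//   Binary:   log_2(n/t) = log_2(2^25 / 8) = 22 comparisons (sequential)
//   Parallel: log_t(n)   = log_8(2^25)     = 25/3 ~ 8.3 passes, each doing t range checks in parallel,
// so the parallel scheme trades fewer passes for t comparisons per pass plus coordination overhead.
//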
// Compiling this code sample (Linux Mint - g++ 4.8)
//
// Compiler options:
// g++ -O3 -fopenmp -mavx -m64 -g -Wall -c -fmessage-length=0 -fno-omit-frame-pointer
//     -ffast-math -ftree-vectorizer-verbose=3 -std=c++11 -I/opt/intel/vtune_amplifier_xe_2013/include
//     multithreading.cpp
//
// Linker options:
// g++ -fopenmp -L/opt/intel/vtune_amplifier_xe_2013/lib64/ -o "multithreading" $(OBJS) $(USER_OBJS) $(LIBS) -lpthread -latomic -littnotify -ldl
//
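// Minimal build without the VTune profiler (an illustrative sketch, assuming the source is saved as
// multithreading.cpp as in the compile line above and INTEL_NO_ITTNOTIFY_API is left defined below):
//   g++ -O3 -fopenmp -mavx -m64 -std=c++11 -ffast-math multithreading.cpp -o multithreading -lpthread -latomic
//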
//==============================================================
#include <thread>
#include <future>
#include <condition_variable>
#include <atomic>
#include <functional>
#include <deque>
#include <vector>
#include <set>
#include <iostream>
#include <string>
#include <sstream>
#include <cmath>
#include <algorithm>
#include <omp.h>
#include <immintrin.h>
//#include <cilk/cilk.h>
//Uncomment following #define if you do not have Intel VTune Amplifier XE 2013 performance profiler.
#define INTEL_NO_ITTNOTIFY_API
//include Vtune API header iff INTEL_NO_ITTNOTIFY_API is not #defined
#ifndef INTEL_NO_ITTNOTIFY_API
#include "ittnotify.h"
#endif
//Macro to add VTUNE API call iff INTEL_NO_ITTNOTIFY_API is not #defined
#ifdef INTEL_NO_ITTNOTIFY_API
#define VTUNE(STATEMENT)
#else
#define VTUNE(STATEMENT) STATEMENT
#endif
//Macro to add VTUNE API call iff INTEL_NO_ITTNOTIFY_API is not #defined
#ifdef INTEL_NO_ITTNOTIFY_API
#define VTUNE_TASK(DOMAIN, FUNC, STATEMENTS) STATEMENTS
#else
#define VTUNE_TASK(DOMAIN, FUNC, STATEMENTS) \
{ \
auto domain = __itt_domain_create(DOMAIN); \
__itt_task_begin(domain, __itt_null, __itt_null, __itt_string_handle_create(FUNC)); \
STATEMENTS \
__itt_task_end(domain); \
}
#endif
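//Example usage of the macros above (mirrors the calls further down in this file; with
//INTEL_NO_ITTNOTIFY_API defined, VTUNE(...) expands to nothing and VTUNE_TASK runs just STATEMENTS):
// VTUNE(__itt_resume();)
// VTUNE_TASK("Parallel Search", "simd_search()",
//     for (unsigned i = 0; i < factor; ++i) simd_search();
// )
// VTUNE(__itt_pause();)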
// =================================================================
// Class wrapper for std::packaged_task to make different signatures, e.g. int(void), float(int,int), ...
// storable in an STL container for the thread pool.
// ===
// N.B. A simpler mechanism would be std::vector<std::function<void(void)>>; v[i] = std::packaged_task<Type(Type)>(type),
// since packaged_task has void operator()(void). However, there is a problem: std::function
// requires the command object to be copyable and packaged_task has move-only semantics.
//==================================================================
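//Illustrative sketch of the problem described above (not part of the build): the naive approach fails
//to compile because std::function requires a copyable callable and packaged_task is move-only:
// std::packaged_task<int(void)> pt([]{ return 42; });
// std::function<void(void)> f = std::move(pt); // error: std::packaged_task is not CopyConstructible
//MyPackagedTask below works around this by holding the packaged_task in a movable wrapper behind a
//polymorphic base whose virtual operator()() forwards the call.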
//============================
//Primary template
//============================
template< typename Out = void, typename In = void >
struct MyPackagedTask
{
virtual ~MyPackagedTask(void)
{
}
};
//============================
//Explicit specialization, acts as base class
// MyPackagedTask<>& poly = *new MyPackagedTask<MyType(OtherType)>;
// poly(); --> calls MyPackagedTask<MyType(OtherType)>::op()
//============================
template<>
struct MyPackagedTask<>
{
virtual ~MyPackagedTask(void)
{
}
virtual void operator()(void)
{
}
};
std::mutex last_return_mutex;
//============================
//Specialization for function signature
// MyPackagedTask<MyType(OtherType)>
//============================
template< typename Out, typename... In >
struct MyPackagedTask< Out(In...) > : public MyPackagedTask<>
{
MyPackagedTask(std::function<Out(In...)> func, In... in) :
m_task(std::bind(func, in...))
{
}
virtual ~MyPackagedTask(void)
{
}
MyPackagedTask(MyPackagedTask&& other) :
m_task(std::move(other.m_task))
{
}
void operator()(void)
{
m_task();
}
std::future<Out> get_future(void)
{
return m_task.get_future();
}
private:
std::packaged_task<Out(void)> m_task;
};
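//Illustrative usage of MyPackagedTask (a sketch mirroring ThreadPool::push below, not part of the build):
// std::function<int(int,int)> mul = [](int a, int b){ return a*b; };
// MyPackagedTask<int(int,int)> task(mul, 3, 4); //arguments bound at construction
// std::future<int> fut = task.get_future();
// MyPackagedTask<>& poly = task;                //erase the signature via the base class
// poly();                                       //runs the bound call
// int result = fut.get();                       //== 12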
//======================================================================
// Simple thread pool class
// Places tasks onto common queue
// Allocates fixed number of threads which pop tasks.
// TODO: Load balancing, cache ping-pong (RFOs), VTune optimisation.
//====================================================================
class ThreadPool
{
public:
ThreadPool(unsigned max_num_threads = 1U << 31) :
m_done(false), //notice to threads to shut down
m_print_shutdown_msg(true), //print or not print shutdown msg
m_max_num_threads(max_num_threads), //maximum num threads allowed in pool
m_num_threads(0), //num threads allocated by the pool
m_processing(0), //tasks still running
m_cancel(false)
{
}
~ThreadPool(void)
{
//Shut down threads iff user has not already called shutdown()
if (!m_done)
{
shutdown();
}
}
//=================
// Push task onto pool
//================
template< typename Out, typename... In >
std::future<Out> push(std::function<Out(In...)> func, In... in)
{
//Create task, store future
MyPackagedTask<Out(In...)> task(func, in...);
std::future<Out> ret_val = task.get_future();
//lock task queue, push the new task onto the queue, notify threads waiting on empty queue, release lock
if (m_cancel == false)
{
{
std::unique_ptr<MyPackagedTask<>> ptr(new MyPackagedTask<Out(In...)>(std::move(task))); //Base* = &Derived for poly call
std::lock_guard<std::mutex> lock(m_tasks); //lock queue
m_pool.push_back(std::move(ptr)); //push task
} //release lock
m_condition_variable.notify_all(); //notify waiting threads
//spawn a thread (async will prevent oversubscription) and store the thread future (to check for thread termination at pool shutdown)
if ((++m_num_threads <= m_max_num_threads))
{
std::unique_lock<std::mutex> lock(m_threads);
m_thread_list.push_back(std::async(std::launch::async, &ThreadPool::run_tasks, this));
}
}
//return packaged_task future so that caller can wait for result
return ret_val;
}
//=================
// get number of threads allocated
//================
unsigned get_num_threads(void)
{
std::unique_lock<std::mutex> lock(m_threads);
return m_thread_list.size();
}
//=================
// Cancel all tasks but keep threads alive (for reuse by the next set of tasks during iteration). Not yet tested.
//================
void cancel_tasks(void)
{
m_cancel = true;
while (m_processing != 0);
{
std::unique_lock<std::mutex> lock(m_tasks); //lock task queue
m_pool.clear();
}
m_cancel = false;
}
//=================
// Kill all threads and print out shutdown message (iff msg==true)
//================
void shutdown(bool msg = true)
{
m_print_shutdown_msg = msg;
{
if (m_print_shutdown_msg)
{
std::unique_lock<std::mutex> lock(m_shutdown);
std::cout << std::endl << "=================================================================" << std::endl
<< "Shutting down threads: ";
}
}
cancel_tasks();
//Notify all threads of thread pool termination
m_done = true;
m_condition_variable.notify_all();
//Loop over all threads and wait for them to terminate
{
std::unique_lock<std::mutex> lock(m_threads);
for (auto& elem : m_thread_list)
{
while (!elem.valid());
elem.get();
}
}
//Clear thread queue
{
std::unique_lock<std::mutex> lock(m_threads);
m_thread_list.clear();
}
//Print out shutdown message
if (m_print_shutdown_msg)
{
std::unique_lock<std::mutex> lock(m_shutdown);
std::cout << std::endl << "=================================================================" <<
std::endl;
}
}
private:
//=================
// Pop and run tasks in threads.
//================
void run_tasks(void)
{
//To avoid branch misprediction, use array to store branch code instead of if-else
std::unique_ptr<MyPackagedTask<>> func;
std::function<void(void)> branch_true = [&]{ func = std::move(m_pool.front()); m_pool.pop_front();
};
std::function<void(void)> branch_false = [&]{ func = std::unique_ptr<MyPackagedTask<>>(new
MyPackagedTask<>); };//NOP
std::function<void(void)> switch_func[2]{ branch_false, branch_true};
while (!m_done)
{
// Wait briefly for a task to become available (or for shutdown)
{
bool empty; //Status of task queue
std::unique_lock<std::mutex> lock(m_tasks); //lock task queue
m_condition_variable.wait_for(lock, std::chrono::nanoseconds(100), [&]{ return !(empty = m_pool.empty()) || m_done; }); //wake up if the queue is non-empty or on shutdown
switch_func[!empty && !m_done](); //only run non-NOP if queue not empty and not shutdown.
}
++m_processing;
(*func)();
--m_processing;
}
//Print out shutdown msg
if (m_done && m_print_shutdown_msg)
{
std::unique_lock<std::mutex> lock(m_shutdown);
std::cout << std::this_thread::get_id() << " ";
}
}
std::atomic<bool> m_done;
std::atomic<bool> m_print_shutdown_msg;
std::atomic<unsigned> m_max_num_threads;
std::atomic<unsigned> m_num_threads;
std::atomic<unsigned> m_processing;
std::atomic<bool> m_cancel;
std::deque< std::unique_ptr<MyPackagedTask<>> > m_pool;
std::vector< std::future<void> > m_thread_list;
std::mutex m_threads;
std::mutex m_tasks;
std::mutex m_shutdown;
std::condition_variable m_condition_variable;
};
//=====================================
// Simple test class
// Creates a few tasks, pushes them onto thread pool, gets results
//==================================================================
struct SimpleTest
{
SimpleTest(void) try
{
std::cout << std::endl << "Simple Test......" << std::endl << std::endl;
//Create thread pool
ThreadPool thread_pool;
//create a task
std::thread::id f1_id;
std::function< int(int, int) > f1 = [&](int i, int j)
{
f1_id = std::this_thread::get_id();
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
return i*j;
};
//create another task
std::thread::id f2_id;
std::function< std::string(void) > f2 = [&](void)
{
f2_id = std::this_thread::get_id();
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
return std::string("return value of f2");
};
//create another task
std::thread::id f3_id;
std::string f3_str;
std::function< void(void) > f3 = [&](void)
{
f3_id = std::this_thread::get_id();
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
f3_str = "f3 called";
};
//push tasks
auto start = std::chrono::high_resolution_clock::now(); //start timer
std::future<int> fut_1(std::move(thread_pool.push(f1, 10, 20)));
std::future<std::string> fut_2 = thread_pool.push(f2);
int fut_1_res = fut_1.get();
std::string fut_2_res = fut_2.get();
auto end = std::chrono::high_resolution_clock::now(); //stop timer
//std::future<void> fut_3 = thread_pool.push(f3); //TODO - fix compilation error.
// std::cout << typeid(decltype(thread_pool.push(f3))).name() << std::endl; // gives std::future<void>
//std::future<void> test_fut; //compiles
//std::future<void> test_fut1 = std::move(test_fut); //compiles
//thread_pool.push(f3); // doesn't compile
// std::function< int(int) > f4 = [&](int i){ return ++i; }; //compiles
// thread_pool.push(f4, 2); //compiles
// std::function< void(int) > f4 = [&](int i){ ++i; }; //compiles
// thread_pool.push(f4, 2); //doesn't compile
//print num of threads running, thread id for tasks, result sent back by tasks
std::cout << "num threads=" << thread_pool.get_num_threads() << std::endl;
std::cout << "f1 thread id=" << f1_id << std::endl;
std::cout << "f1's result: " << fut_1_res << std::endl;
std::cout << "f2 thread id=" << f2_id << std::endl;
std::cout << "f2's result: " << fut_2_res << std::endl;
//std::cout << "f3 thread id=" << f3_id << std::endl;
//std::cout << "f3's result: " << f3_str << std::endl;
std::cout << "thread_pool time = "
<< std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() << "ns" << std::
endl;
//cleanup threads
//thread_pool.shutdown(); test dtor
}
catch (...)
{
std::cout << "SimpleTest exception" << std::endl;
}
};
//==============================================================
// Parallel vs binary search test class
// t - number of threads ( > 1)
// Binary = O(log_2(n/t)) = O(log_2(n)-log_2(t))
// Parallel = O(log_t(n))
//
// Binary = std::find (single threaded for comparison).
// Parallel: (1) Split array into equal chunks, push them onto thread pool
// (2) Chunk containing search-val returns true. N.B. predicate simply returns begin() <= val <= end().
// (3) Chunk returning true replaces array and step (1) is repeated.
// N.B. Parallel search gets insertion point of nearest match rather than vector.end() if no match
// Not optimised for RFOs (cache ping-pong), load balancing, VTune or early match (binary is quicker if an early match is found).
// Benchmarks show high overhead of thread pool.
//===============================================================
struct ParallelSearch
{
//Choose which to run
bool is_lock_free = false; //run lock-free lambda (if enabled, should set variable "factor", below, to 100 since the search is time consuming)
bool is_lock_based = false; //run lock-based lambda (if enabled, should set variable "factor", below, to 100 since the search is time consuming)
bool is_simd = true; //run simd-based lambda
//Choose number of threads
//unsigned num_threads = 2; //To get thread overhead (parallel/binary = 1 for no overhead)
//const unsigned num_threads = std::thread::hardware_concurrency()/2; //undersubscription, should run slower than optimal
const unsigned num_threads = std::thread::hardware_concurrency(); //Should be the optimal choice
//const unsigned num_threads = 2*std::thread::hardware_concurrency(); //moderate oversubscription, should run slower than optimal
//const unsigned num_threads = 4*std::thread::hardware_concurrency(); //heavy oversubscription, should run slower than optimal
//const unsigned num_threads = 128 * std::thread::hardware_concurrency(); //massive oversubscription, should run slower than optimal
ParallelSearch(void) try
{
std::atomic<bool> done(false); //flag used in lock-free search to notify of completion
//Create large, sorted array on heap to avoid seg fault.
const unsigned size = 2 << 24;
std::vector<unsigned> my_array(size);
for (auto& elem : my_array)
{
static unsigned i = 0;
elem = 2 * i; //even numbers
++i;
}
//double-word atomic containing the address of a matching chunk and the new chunk length (size, size/t, size/t^2 ...)
struct DoubleWord
{
unsigned* m_address;
unsigned m_chunk_length;
};
std::atomic<DoubleWord> chunk_address_and_length(DoubleWord{ &my_array[0], size / num_threads });
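//Added note (hedged): DoubleWord is 16 bytes under -m64, so this atomic typically goes through
//libatomic (hence -latomic on the link line above) and may not be lock-free in the strict sense;
//this can be checked at runtime if required, e.g.:
// std::cout << "DoubleWord atomic is lock-free: " << chunk_address_and_length.is_lock_free() << std::endl;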
//val searched for (TODO: binary search is faster than parallel search if binary finds an early match. Need to terminate the parallel search earlier)
bool even = true;
unsigned val = my_array[((size >> 1) + ((size >> 1) - 1)*rand() / RAND_MAX)] + (even ? 0 : 1); //even/odd number --> found/not found
//Variables for found position, passes taken and whether to printout progress(incurs overhead)
unsigned* ret_val = &my_array[0];
int passes = 0; //int required by g++ autovectorize
bool printout = false;
//SIMD lambda (Proved to be quite difficult getting g++ to autovectorise)
//(N.B. This was compiled for Intel i7-3720QM Ivy Bridge processor)
// 1. Split array into t chunks
// 2. Allocate chunks to t SIMD lanes
// 3. Each lane checks chunk.begin() <= search-val <= chunk.end()
// 4. The SIMD lane getting a match sets array = chunk
// 5. Steps 1 to 4 repeated until chunk is 1 element long.
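// Worked example of the chunking schedule (illustrative): with size = 2<<24 = 2^25 and 8 lanes, the
// chunk length shrinks by a factor of 8 each pass: 2^22, 2^19, ..., 2^4, then 2, then 1, which is why
// chunk_length[] and limits[] below have 9 entries (the final pass only needs 2 lanes).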
std::function<bool(void)> simd_search = [&]()
{
//Alignment (SSE - 16 byte SIMD register, AVX 32-byte SIMD register)
const unsigned alignment = 16; //g++ bug with 32-byte (http://gcc.gnu.org/bugzilla/show_bug.cgi?id=56787)
//Pre-calculate chunk size (size/8, size/64, size/8^3 ... 1 element(s))
alignas(alignment) int chunk_length[9]{size >> 3, size >> 6, size >> 9, size >> 12, size >> 15, size >> 18, size >> 21, size >> 24, 1};
//Pre-calculate lower index for lower <= val <= upper. N.B. This is converted to lower[n]/8, lower[n]/64 ...
alignas(alignment) int lower_index[8]{0, size, 2 * size, 3 * size, 4 * size, 5 * size, 6 * size, 7 * size};
//Pre-calculate the number of SIMD lanes to allocate so the for loop can be vectorised
alignas(alignment) int limits[9]{8, 8, 8, 8, 8, 8, 8, 8, 2};
//Running tally of start of chunk to be searched
alignas(alignment) int offset = 0;
alignas(alignment) int tmp_offset = 0;
//Loop until chunk length is 1 element
for (passes = 0; passes<9; ++passes)
{
//Following lambda is a test to see if hotspots marked "LINE X" and "LINE Y", below, are due to memory stalls.
//It turns out prefetch does eliminate hotspots X, Y, but adds overhead of its own, so this search algorithm is unavoidably
//memory-bound unless something along the lines of a heap-ordered array (i.e. the array is laid out as a breadth-first n-ary tree) is
//used to convert random access to linear access without need for scatter-gather.
//#pragma omp parallel for //Adds too much overhead
[&]() //Sadly, won't vectorise due to function call
{
unsigned *__restrict__ tmp_ret_val = (unsigned*)__builtin_assume_aligned(&my_array[0],
alignment); //Requirement for autovectorise.
for (int pos = 0; pos<limits[passes]; ++pos) //Loop over SIMD lanes to prefetch data they will use
{
int tmp = pos*chunk_length[passes]; //Get lower index for chunk interval
__builtin_prefetch(&my_array[0] + offset + tmp); //See if it removes hotspot from "LINE X", below
__builtin_prefetch(&my_array[0] + offset + tmp + chunk_length[passes] - 1); //See if it removes hotspot from "LINE Y", below
}
}();
//Fork: Assign each chunk to an SIMD lane
//N.B. Use a lambda to force vectorisation of the loop. Without it, the loop is unrolled but not SLP-vectorised. This does autovectorise under g++ 4.8
//N.B. Code has been broken down into painfully simple steps to help the autovectoriser and pinpoint which operations are causing trouble
[&]()
{
unsigned *__restrict__ tmp_ret_val = (unsigned*)__builtin_assume_aligned(&my_array[0],
alignment); //Requirement for autovectorise.
alignas(alignment) int chunk = chunk_length[passes];
for (alignas(alignment) int pos = 0; pos<limits[passes]; ++pos) //Loop over SIMD lanes
{
//Find matching chunk by adding 0 to offset for no-match and the chunk address for a match
alignas(alignment) int tmp = pos*chunk; //Lower index of chunk range without offset
//int tmp=lower_index[pos]>>3; //g++ bug (http://gcc.gnu.org/bugzilla/show_bug.cgi?id=56787). Can't use 32-byte AVX.
alignas(alignment) int lower_ind = offset + tmp; //Lower index of chunk range
alignas(alignment) int upper_ind = lower_ind + chunk - 1; //Upper index of chunk range
unsigned lower_val = tmp_ret_val[lower_ind]; //LINE X - Hotspot (eliminated by above prefetch)
unsigned upper_val = tmp_ret_val[upper_ind]; //LINE Y - Hotspot (eliminated by above prefetch)
alignas(alignment) bool test_lower = lower_val <= val; //Lower
alignas(alignment) bool test_upper = val <= upper_val; //and upper limit check
alignas(alignment) bool test = test_lower && test_upper; //is search-val inside the chunk for this SIMD lane?
tmp_offset += test*tmp; //Horrible construct to get it to autovec. It masks out SIMD lanes that don't contain the search val.
//Following fails because it is "not suitable for gather" (whatever that means)
//offset += ((tmp_ret_val[offset+tmp] <= val) & (val <= tmp_ret_val[offset+tmp+chunk_length[passes]-1]))*tmp;
//Following fails because of "control flow" (can't see why g++ doesn't autovec it, control flow can be replaced with a masked op)
//if((tmp_ret_val[offset+tmp] <= val) && (val <= tmp_ret_val[offset+tmp+chunk_length[passes]-1])) tmp_offset = tmp;
}
}();
//Join: end of SIMD
//Update chunk start address index
offset = tmp_offset;
/*std::cout << "offset=" << offset << std::endl;
std::cout << "passes=" << passes;
std::cout << ", val=" << val;
std::cout << ", range=[" << array[offset] << "," << array[offset+1];
std::cout<< ", chunk length=" << chunk_length[passes] << std::endl;
*/
}
//Update final index of search-val
ret_val = &my_array[0] + offset;
return true;
};
//Lock-free lambda for each thread
//Operation:
//1. The array is split into t (num of threads) chunks
//2. Each thread examines its chunk
//3. If a match is found in a chunk, the thread changes the array to be that chunk.
//4. The process repeated from step 1.
//t threads continuously monitor the array and process their chunk of the array. Since the array pointer is
// atomic, when one thread sees a matching chunk and changes the array to be that chunk, this is picked up
// by all threads. No synchronisation is needed.
// arg chunk_pos - section of chunk to search (0 - [begin, begin+chunk_length/t], 1 - [begin+chunk_length/t, begin+2*chunk_length/t], ..)
std::atomic<unsigned> running_threads(0);
std::atomic<bool> go(false);
std::function<bool(unsigned)> lock_free = [&](unsigned chunk_pos)
{
//Increment running thread count
++running_threads;
//Keep all threads on hold until signalled to begin together (for timings).
while (!go);
//Keep searching until a thread notifies completion.
while (!done)
{
//capture chunk address and length
DoubleWord capture = chunk_address_and_length;
//Check if search-val between chunk.begin() and chunk.end()
unsigned *begin = capture.m_address + chunk_pos*capture.m_chunk_length;
unsigned *end = begin + capture.m_chunk_length - 1;
unsigned test1 = *begin, test2 = *end;
if (*begin <= val && val <= *end)
{
//Print out iterations (adds significant overhead)
static std::mutex printout_mutex;
if (printout)
{
std::unique_lock<std::mutex> lock(printout_mutex);
//Print out iterations (adds significant overhead)
std::cout << "Parallel find (pass " << passes << "): Closest match "
<< *begin << "<=" << val << "<=" << *end
<< ", chunk length=" << capture.m_chunk_length << std::endl;
}
//Update parent variables for printouts
ret_val = begin;
++passes; //keep count of passes (0 - size, 1 - size/t, 2 - size/t^2 ... with t = num threads)
std::function<void(void)> branch_true = [&]() //IF
{
//Update chunk length and address
capture.m_chunk_length = (capture.m_chunk_length >= num_threads ? (capture.
m_chunk_length / num_threads) : 1); //divide chunk evenly
capture.m_address = begin; //point to this chunk
chunk_address_and_length = capture;
};
std::function<void(void)> branch_false = [&]() //ELSE
{
done = true; //notify parent and sister threads of completion
};
std::function<void(void)> if_else[2]{branch_false, branch_true}; //{else, if}
if_else[capture.m_chunk_length > 1](); //if-else
}
else
{
std::this_thread::sleep_for(std::chrono::nanoseconds(5000));
}
}
return true;
};
//Create thread pool for lock-based search
static ThreadPool thread_pool(num_threads);
//Notification of completion of lock-based search
std::condition_variable finished;
// Lock-based lambda for each thread. It simply tests whether array[pos] <= search_val <= array[pos + chunk_length]
// and iff true, spawns t threads to narrow down the search, iteratively arriving at the insertion point.
std::function<bool(unsigned*, unsigned)> lock_based = [&](unsigned* tmp, unsigned chunk_length) //
{
//Keep all threads on hold until signalled to begin together (for timings).
while (!go);
//Check if search-val between chunk.begin() and chunk.end()
if (*tmp <= val && val <= *(tmp + chunk_length - 1))
{
//Print out iterations (adds significant overhead)
if (printout)
{
std::cout << "Parallel find (pass " << passes << "): Closest match " << *tmp << "<=" <<
val
<< "<=" << (chunk_length > 1 ? *(tmp + chunk_length - 1) : *tmp == val ? val : *(tmp
+ 1))
<< ", chunk length=" << chunk_length << std::endl;
}
//Update parent variables for printouts
ret_val = tmp;
++passes; //keep count of passes (0 - size, 1 - size/t, 2 - size/t^2 ... with t = num threads)
//Spawn new tasks to process this chunk
//Following peculiar construct is to avoid branch misprediction by using an array of fn ptrs to replace if-else
//need VTune to test out whether it saves us any mispredictions.
std::function<void(void)> branch_true = [&]() //IF
{
chunk_length = (chunk_length >= num_threads ? (chunk_length / num_threads) : 1);
//divide chunk evenly
for (unsigned index = 0; index < num_threads; ++index)
{
thread_pool.push(lock_based, tmp + index*chunk_length, chunk_length);
}
};
std::function<void(void)> branch_false = [&]() //ELSE
{
finished.notify_one(); //chunk length is 1, so we are finished dividing-and-conquering
};
std::function<void(void)> if_else[2]{branch_false, branch_true}; //{else, if}
if_else[chunk_length>1](); //if-else
}
return true;
};
std::cout << std::endl << "Parallel vs Binary Search......" << std::endl << std::endl;
//Obtain position of element (to verify parallel search finds correct position).
auto pos = std::find(my_array.begin(), my_array.end(), val);
//Ordinary binary search for timing comparison
std::cout << std::endl << "==============================================================================" << std::endl;
std::cout << "Running binary search, please wait a few minutes ..." << std::endl;
std::cout << "==============================================================================" << std::endl;
unsigned factor = 10000; //number of times to run search
auto start_binary = std::chrono::high_resolution_clock::now(); //start timer
VTUNE(__itt_resume();)
for (unsigned i = 0; i < factor; ++i) std::binary_search(my_array.begin(), my_array.end(), val); //binary search
VTUNE(__itt_pause();)
auto end_binary = std::chrono::high_resolution_clock::now(); //stop timer
//print out results of binary search
using std::chrono::duration_cast;
using std::chrono::nanoseconds;
std::cout << "clock resolution is: " << static_cast<double>(std::chrono::high_resolution_clock::
period::num) << " ns" << std::endl;
std::cout << "std::find: val=" << val << ", element=" << (pos != my_array.end() ? *pos : -1) << ",
index=" << pos - my_array.begin()
<< ", found==" << std::boolalpha << (pos != my_array.end()) << ", time=" << duration_cast
<nanoseconds>(end_binary - start_binary).count() << "ns" << std::endl;
//Parallel searches
//SIMD search
if (is_simd)
{
std::cout << std::endl << "==============================================================================" << std::endl;
std::cout << "Running simd parallel search, please wait a few minutes ..." << std::endl;
std::cout << "==============================================================================" << std::endl;
//Kick off the search
auto start = std::chrono::high_resolution_clock::now(); //start timer
VTUNE(__itt_resume();)
VTUNE_TASK("Parallel Search", "simd_search()",
for (unsigned i = 0; i<factor; i++) simd_search();
)
//Wait for result and then get the insertion point and number of passes
VTUNE(__itt_pause();)
//get execution time
auto end = std::chrono::high_resolution_clock::now(); //stop timer
auto parallel_time = std::chrono::duration_cast<nanoseconds>(end - start).count();
//print results
double complexity = (passes + 1) / (std::log(size) / std::log(2));
auto binary_time = duration_cast<nanoseconds>(end_binary - start_binary).count();
std::cout << "Simd results:" << std::endl;
std::cout << "Size of array=" << size / 1000000 << " million elements" << std::endl;
std::cout << "Search repeated " << factor << " times" << std::endl;
std::cout << "number of threads=" << running_threads << std::endl;
std::cout << "val=" << val << ", [element_lower,element_upper]=[" << ret_val[0] << "," <<
ret_val[1] << "]";
std::cout << ", [index_lower,index_upper]=[" << ret_val - &my_array[0] << "," << ret_val + 1 - &
my_array[0] << "]" << std::endl;
std::cout << "O(n_parallel)/O(n_binary)=" << complexity << std::endl;
std::cout << "Time(parallel) / Time(binary) ) = " << parallel_time << "ns" << "/" << binary_time
<< "ns";
std::cout << " = ";
std::cout << parallel_time / binary_time << std::endl;
std::cout << "threading overhead =" << parallel_time - complexity*binary_time << "ns" << std::
endl;
}
//Lock-free multithreaded search
if (is_lock_free)
{
std::cout << std::endl << "==============================================================================" << std::endl;
std::cout << "Running lock-free parallel search, please wait a few minutes ..." << std::endl;
std::cout << "==============================================================================" << std::endl;
double parallel_time = 0;
for (unsigned i = 0; i<factor; ++i)
{
//reset passes counter, chunk struct, done flag
passes = 0;
chunk_address_and_length = DoubleWord{ &my_array[0], size / num_threads };
done = false;
//Kick off the search
//auto start = std::chrono::high_resolution_clock::now(); //start timer
std::vector<std::future<bool>> futures;
go = false;
futures.push_back(std::move(std::async(std::launch::deferred, lock_free, 0)));
for (unsigned chunk_pos = 1; chunk_pos < num_threads; ++chunk_pos)
{
futures.push_back(std::move(std::async(std::launch::async, lock_free, chunk_pos)));
}
//Wait for result and then get the insertion point and number of passes
auto start = std::chrono::high_resolution_clock::now(); //start timer
VTUNE(__itt_resume();)
VTUNE_TASK("Parallel Search", "lock_free()",
go = true; futures[0].get();
)
VTUNE(__itt_pause();)
//get execution time
auto end = std::chrono::high_resolution_clock::now(); //stop timer
parallel_time += std::chrono::duration_cast<nanoseconds>(end - start).count();
}
//print results
double complexity = (passes + 1) / (std::log(size) / std::log(2));
auto binary_time = duration_cast<nanoseconds>(end_binary - start_binary).count();
std::cout << "Lock free results:" << std::endl
<< "Size of array=" << size / 1000000 << " million elements" << std::endl
<< "Search repeated " << factor << " times" << std::endl
13C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp
<< "number of threads=" << running_threads << std::endl
<< "val=" << val << ", [element_lower,element_upper]=[" << ret_val[0] << "," << ret_val[1] <
< "]"
<< ", [index_lower,index_upper]=[" << ret_val - &my_array[0] << "," << ret_val + 1 - &
my_array[0] << "]" << std::endl
<< "O(n_parallel)/O(n_binary)=" << complexity << std::endl
<< "Time(parallel) / Time(binary) ) = " << parallel_time << "ns" << "/" << binary_time <<
"ns"
<< " = "
<< parallel_time / binary_time << std::endl
<< "threading overhead =" << parallel_time - complexity*binary_time << "ns" << std::endl;
}
if (is_lock_based)
{
std::cout << std::endl << "==============================================================================" << std::endl;
std::cout << "Running lock-based parallel search, please wait a few minutes ..." << std::endl;
std::cout << "==============================================================================" << std::endl;
double parallel_time = 0;
for (unsigned i = 0; i<factor; ++i)
{
//reset passes counter, chunk struct, done flag
passes = 0;
//Kick off the search
//auto start = std::chrono::high_resolution_clock::now(); //start timer
go = false;
auto f = thread_pool.push(lock_based, &my_array[0], size);
//Wait for result and then get the insertion point and number of passes
{
//wait for completion
std::mutex dummy;
std::unique_lock<std::mutex> lock(dummy);
auto start = std::chrono::high_resolution_clock::now(); //start timer
VTUNE(__itt_resume();)
VTUNE_TASK("Parallel Search", "lock_based()",
go = true;
finished.wait(lock);
)
VTUNE(__itt_pause();)
//get execution time
auto end = std::chrono::high_resolution_clock::now(); //stop timer
parallel_time += std::chrono::duration_cast<nanoseconds>(end - start).count();
thread_pool.cancel_tasks();
std::this_thread::sleep_for(std::chrono::milliseconds(10));
}
}
//kill thread pool
thread_pool.shutdown(false);
//print results
double complexity = (passes + 1) / (std::log(size) / std::log(2));
auto binary_time = duration_cast<nanoseconds>(end_binary - start_binary).count();
std::cout << "Lock based results:" << std::endl
<< "Size of array=" << size / 1000000 << " million elements" << std::endl
<< "Search repeated " << factor << " times" << std::endl
<< "number of threads=" << running_threads << std::endl
<< "val=" << val << ", [element_lower,element_upper]=[" << ret_val[0] << "," << ret_val[1] <
< "]"
<< ", [index_lower,index_upper]=[" << ret_val - &my_array[0] << "," << ret_val + 1 - &
my_array[0] << "]" << std::endl
<< "O(n_parallel)/O(n_binary)=" << complexity << std::endl
<< "Time(parallel) / Time(binary) ) = " << parallel_time << "ns" << "/" << binary_time <<
"ns"
<< " = "
14C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp
<< parallel_time / binary_time << std::endl
<< "threading overhead =" << parallel_time - complexity*binary_time << "ns" << std::endl;
}
}
catch (...)
{
std::cout << "ParallelSearch exception" << std::endl;
}
};
int main(void)
{
//SimpleTest simple_test;
VTUNE(__itt_pause();)
ParallelSearch parallel_search;
char c;
std::cout << "Press any key to exit" << std::endl;
std::cin >> c; //keep console alive
}

More Related Content

What's hot (19)

PDF
Rooted 2010 ppp
noc_313
 
TXT
C99
scriptexe
 
PDF
Asciidoctor New, Noteworthy and Beyond Devoxx-2017
Alex Soto
 
PDF
Teaching Your Machine To Find Fraudsters
Ian Barber
 
TXT
zinno
guest6a7933
 
PDF
Debugging: Rules And Tools - PHPTek 11 Version
Ian Barber
 
KEY
Introduction to CloudForecast / YAPC::Asia 2010 Tokyo
Masahiro Nagano
 
TXT
Cod
Stan Adrian
 
PDF
Security Meetup 22 октября. «Реверс-инжиниринг в Enterprise». Алексей Секрето...
Mail.ru Group
 
PDF
"let ECMAScript = 6"
The Software House
 
PDF
PHP Static Code Review
Damien Seguy
 
PDF
Your code is not a string
Ingvar Stepanyan
 
KEY
Php 101: PDO
Jeremy Kendall
 
PDF
Drush. Secrets come out.
Alex S
 
PPTX
How I Built a Power Debugger Out of the Standard Library and Things I Found o...
doughellmann
 
PDF
Twitter codeigniter library
Navaneeswar Reddy
 
PDF
Eric Redmond – Distributed Search on Riak 2.0 - NoSQL matters Barcelona 2014
NoSQLmatters
 
PDF
PHP Data Objects
Wez Furlong
 
Rooted 2010 ppp
noc_313
 
Asciidoctor New, Noteworthy and Beyond Devoxx-2017
Alex Soto
 
Teaching Your Machine To Find Fraudsters
Ian Barber
 
Debugging: Rules And Tools - PHPTek 11 Version
Ian Barber
 
Introduction to CloudForecast / YAPC::Asia 2010 Tokyo
Masahiro Nagano
 
Security Meetup 22 октября. «Реверс-инжиниринг в Enterprise». Алексей Секрето...
Mail.ru Group
 
"let ECMAScript = 6"
The Software House
 
PHP Static Code Review
Damien Seguy
 
Your code is not a string
Ingvar Stepanyan
 
Php 101: PDO
Jeremy Kendall
 
Drush. Secrets come out.
Alex S
 
How I Built a Power Debugger Out of the Standard Library and Things I Found o...
doughellmann
 
Twitter codeigniter library
Navaneeswar Reddy
 
Eric Redmond – Distributed Search on Riak 2.0 - NoSQL matters Barcelona 2014
NoSQLmatters
 
PHP Data Objects
Wez Furlong
 

Viewers also liked (7)

PDF
Shared_memory_hash_table
Russell Childs
 
PDF
Algorithms devised for a google interview
Russell Childs
 
PDF
Dynamic programming burglar_problem
Russell Childs
 
PDF
Cpp11 sample linux
Russell Childs
 
PDF
Simple shared mutex UML
Russell Childs
 
PDF
Full resume dr_russell_john_childs_2016
Russell Childs
 
PDF
Interview C++11 code
Russell Childs
 
Shared_memory_hash_table
Russell Childs
 
Algorithms devised for a google interview
Russell Childs
 
Dynamic programming burglar_problem
Russell Childs
 
Cpp11 sample linux
Russell Childs
 
Simple shared mutex UML
Russell Childs
 
Full resume dr_russell_john_childs_2016
Russell Childs
 
Interview C++11 code
Russell Childs
 
Ad

Similar to Cpp11 multithreading and_simd_linux_code (20)

PDF
GPU Programming on CPU - Using C++AMP
Miller Lee
 
PPTX
Medical Image Processing Strategies for multi-core CPUs
Daniel Blezek
 
PPTX
defense-linkedin
Dr. Spiros N. Agathos
 
PPTX
Threads and multi threading
Antonio Cesarano
 
PPTX
Robust C++ Task Systems Through Compile-time Checks
Stoyan Nikolov
 
PDF
4Developers 2018: Evolution of C++ Class Design (Mariusz Łapiński)
PROIDEA
 
PDF
Project02 wit
PeaceFul Tranquility
 
PPTX
17. thread and deadlock
Vahid Heidari
 
PPT
CS4961-L9.ppt
MarlonMagtibay2
 
PDF
Unmanaged Parallelization via P/Invoke
Dmitri Nesteruk
 
PPTX
Embedded_ PPT_4-5 unit_Dr Monika-edited.pptx
ProfMonikaJain
 
PPTX
Computer Architecture and Organization
ssuserdfc773
 
PPT
Threaded Programming
Sri Prasanna
 
PDF
Better Code: Concurrency
Platonov Sergey
 
PDF
Need help implementing the skeleton code below, I have provided the .pdf
ezzi552
 
PDF
OpenMP tasking model: from the standard to the classroom
Facultad de Informática UCM
 
PPTX
CS345 09 - Ch04 Threads operating system1.pptx
RichaAgnihotri13
 
PPTX
Putting a Fork in Fork (Linux Process and Memory Management)
David Evans
 
PDF
40d5984d819aaa72e55aa10376b73bde_MIT6_087IAP10_lec12.pdf
SagarYadav642223
 
PDF
GTC16 - S6510 - Targeting GPUs with OpenMP 4.5
Jeff Larkin
 
GPU Programming on CPU - Using C++AMP
Miller Lee
 
Medical Image Processing Strategies for multi-core CPUs
Daniel Blezek
 
defense-linkedin
Dr. Spiros N. Agathos
 
Threads and multi threading
Antonio Cesarano
 
Robust C++ Task Systems Through Compile-time Checks
Stoyan Nikolov
 
4Developers 2018: Evolution of C++ Class Design (Mariusz Łapiński)
PROIDEA
 
Project02 wit
PeaceFul Tranquility
 
17. thread and deadlock
Vahid Heidari
 
CS4961-L9.ppt
MarlonMagtibay2
 
Unmanaged Parallelization via P/Invoke
Dmitri Nesteruk
 
Embedded_ PPT_4-5 unit_Dr Monika-edited.pptx
ProfMonikaJain
 
Computer Architecture and Organization
ssuserdfc773
 
Threaded Programming
Sri Prasanna
 
Better Code: Concurrency
Platonov Sergey
 
Need help implementing the skeleton code below, I have provided the .pdf
ezzi552
 
OpenMP tasking model: from the standard to the classroom
Facultad de Informática UCM
 
CS345 09 - Ch04 Threads operating system1.pptx
RichaAgnihotri13
 
Putting a Fork in Fork (Linux Process and Memory Management)
David Evans
 
40d5984d819aaa72e55aa10376b73bde_MIT6_087IAP10_lec12.pdf
SagarYadav642223
 
GTC16 - S6510 - Targeting GPUs with OpenMP 4.5
Jeff Larkin
 
Ad

More from Russell Childs (20)

PDF
spinor_quantum_simulator_user_guide_.pdf
Russell Childs
 
PDF
String searching o_n
Russell Childs
 
PDF
String searching o_n
Russell Childs
 
PDF
String searching o_n
Russell Childs
 
PDF
String searching
Russell Childs
 
PDF
Permute
Russell Childs
 
PDF
Permute
Russell Childs
 
PDF
Feature extraction using adiabatic theorem
Russell Childs
 
PDF
Feature extraction using adiabatic theorem
Russell Childs
 
PDF
Wavelets_and_multiresolution_in_two_pages
Russell Childs
 
PDF
Relativity 2
Russell Childs
 
PDF
Recursion to iteration automation.
Russell Childs
 
PDF
Dirac demo (quantum mechanics with C++). Please note: There is a problem with...
Russell Childs
 
PDF
Design pattern to avoid downcasting
Russell Childs
 
PDF
Interview uml design
Russell Childs
 
PDF
Full_resume_Dr_Russell_John_Childs
Russell Childs
 
PDF
K d tree_cpp
Russell Childs
 
PDF
Multithreaded sockets c++11
Russell Childs
 
PDF
IBM Kinexa Prove It! C programming test results.
Russell Childs
 
PDF
IBM Kinexa Prove It! C++ programming test results.
Russell Childs
 
spinor_quantum_simulator_user_guide_.pdf
Russell Childs
 
String searching o_n
Russell Childs
 
String searching o_n
Russell Childs
 
String searching o_n
Russell Childs
 
String searching
Russell Childs
 
Feature extraction using adiabatic theorem
Russell Childs
 
Feature extraction using adiabatic theorem
Russell Childs
 
Wavelets_and_multiresolution_in_two_pages
Russell Childs
 
Relativity 2
Russell Childs
 
Recursion to iteration automation.
Russell Childs
 
Dirac demo (quantum mechanics with C++). Please note: There is a problem with...
Russell Childs
 
Design pattern to avoid downcasting
Russell Childs
 
Interview uml design
Russell Childs
 
Full_resume_Dr_Russell_John_Childs
Russell Childs
 
K d tree_cpp
Russell Childs
 
Multithreaded sockets c++11
Russell Childs
 
IBM Kinexa Prove It! C programming test results.
Russell Childs
 
IBM Kinexa Prove It! C++ programming test results.
Russell Childs
 

Recently uploaded (20)

PDF
Mastering Financial Management in Direct Selling
Epixel MLM Software
 
PDF
The 2025 InfraRed Report - Redpoint Ventures
Razin Mustafiz
 
PPTX
Mastering ODC + Okta Configuration - Chennai OSUG
HathiMaryA
 
PDF
Reverse Engineering of Security Products: Developing an Advanced Microsoft De...
nwbxhhcyjv
 
PPTX
Designing_the_Future_AI_Driven_Product_Experiences_Across_Devices.pptx
presentifyai
 
PPTX
Agentforce World Tour Toronto '25 - Supercharge MuleSoft Development with Mod...
Alexandra N. Martinez
 
PDF
“Voice Interfaces on a Budget: Building Real-time Speech Recognition on Low-c...
Edge AI and Vision Alliance
 
PDF
Future-Proof or Fall Behind? 10 Tech Trends You Can’t Afford to Ignore in 2025
DIGITALCONFEX
 
PDF
Newgen Beyond Frankenstein_Build vs Buy_Digital_version.pdf
darshakparmar
 
PDF
Peak of Data & AI Encore AI-Enhanced Workflows for the Real World
Safe Software
 
PDF
SIZING YOUR AIR CONDITIONER---A PRACTICAL GUIDE.pdf
Muhammad Rizwan Akram
 
PPTX
COMPARISON OF RASTER ANALYSIS TOOLS OF QGIS AND ARCGIS
Sharanya Sarkar
 
PPTX
Digital Circuits, important subject in CS
contactparinay1
 
PPTX
New ThousandEyes Product Innovations: Cisco Live June 2025
ThousandEyes
 
PDF
UPDF - AI PDF Editor & Converter Key Features
DealFuel
 
PDF
AI Agents in the Cloud: The Rise of Agentic Cloud Architecture
Lilly Gracia
 
PDF
Staying Human in a Machine- Accelerated World
Catalin Jora
 
PDF
NLJUG Speaker academy 2025 - first session
Bert Jan Schrijver
 
PDF
LOOPS in C Programming Language - Technology
RishabhDwivedi43
 
PDF
What’s my job again? Slides from Mark Simos talk at 2025 Tampa BSides
Mark Simos
 
Mastering Financial Management in Direct Selling
Epixel MLM Software
 
The 2025 InfraRed Report - Redpoint Ventures
Razin Mustafiz
 
Mastering ODC + Okta Configuration - Chennai OSUG
HathiMaryA
 
Reverse Engineering of Security Products: Developing an Advanced Microsoft De...
nwbxhhcyjv
 
Designing_the_Future_AI_Driven_Product_Experiences_Across_Devices.pptx
presentifyai
 
Agentforce World Tour Toronto '25 - Supercharge MuleSoft Development with Mod...
Alexandra N. Martinez
 
“Voice Interfaces on a Budget: Building Real-time Speech Recognition on Low-c...
Edge AI and Vision Alliance
 
Future-Proof or Fall Behind? 10 Tech Trends You Can’t Afford to Ignore in 2025
DIGITALCONFEX
 
Newgen Beyond Frankenstein_Build vs Buy_Digital_version.pdf
darshakparmar
 
Peak of Data & AI Encore AI-Enhanced Workflows for the Real World
Safe Software
 
SIZING YOUR AIR CONDITIONER---A PRACTICAL GUIDE.pdf
Muhammad Rizwan Akram
 
COMPARISON OF RASTER ANALYSIS TOOLS OF QGIS AND ARCGIS
Sharanya Sarkar
 
Digital Circuits, important subject in CS
contactparinay1
 
New ThousandEyes Product Innovations: Cisco Live June 2025
ThousandEyes
 
UPDF - AI PDF Editor & Converter Key Features
DealFuel
 
AI Agents in the Cloud: The Rise of Agentic Cloud Architecture
Lilly Gracia
 
Staying Human in a Machine- Accelerated World
Catalin Jora
 
NLJUG Speaker academy 2025 - first session
Bert Jan Schrijver
 
LOOPS in C Programming Language - Technology
RishabhDwivedi43
 
What’s my job again? Slides from Mark Simos talk at 2025 Tampa BSides
Mark Simos
 

Cpp11 multithreading and_simd_linux_code

  • 1. 1C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp /* * multi.cpp * * Created on: 13 Mar 2014 * Author: Russell John Childs. */ //======================================================================================================= // COPYRIGHT NOTICE // This code sample and ideas embodied remain the property of Russell John Childs, PhD and have been // distributed as a representative example of my use of C++11 features. //========================================================================================================== ============ //==================================================== // File contains // (1) Implementation of lock-based thread pool. // (2) Implementation of lock-free "thread pool". // (3) Implementation of SIMD, autovectorisation (N.B. This was compiled for Intel i7-3720QM Ivy Bridge processor) // Implementation of parallel search (using lock-based thread pool, lock-free event-based pool, SIMD parallel for): // 1) Split sorted array into <num_threads> equal chunks // 2) Assign each chunk to a thread. // 3) Thread returns true iff chunk.begin() <= search_val <= chunk.end() // 4) Replace array with chunk that returned true and return to step 1 // Complexity: // t - number of threads ( > 1 ) // Binary = O(log_2(n/t)) = O(log_2(n)-log_2(t)) // Parallel = O(log_t(n)) // // Compiling this code sample (Linux Mint - g++ 4.8) // // Compiler options: // g++ -O3 -fopenmp -mavx -m64 -g -Wall -c -fmessage-length=0 -fno-omit-frame-pointer // --fast-math -ftree-vectorizer-verbose=3 -std=c++11 -I/opt/intel/vtune_amplifier_xe_2013/include / // multithreading.cpp // // Linker options: // g++ -fopenmp -L/opt/intel/vtune_amplifier_xe_2013/lib64/ -o "multithreading" $(OBJS) $(USER_OBJS) $ (LIBS) -lpthread -latomic -littnotify -ldl // //============================================================== #include <thread> #include <future> #include <condition_variable> #include <atomic> #include <functional> #include <deque> #include <vector> #include <set> #include <iostream> #include <string> #include <sstream> #include <cmath> #include <algorithm> #include <omp.h> #include <immintrin.h> //#include <cilk/cilk.h> //Uncomment following #define if you do not have Intel VTune Amplifier XE 2013 performance profiler. #define INTEL_NO_ITTNOTIFY_API //include Vtune API header iff INTEL_NO_ITTNOTIFY_API is not #defined #ifndef INTEL_NO_ITTNOTIFY_API #include "ittnotify.h" #endif //Macro to add VTUNE API call iff INTEL_NO_ITTNOTIFY_API is not #defined #ifdef INTEL_NO_ITTNOTIFY_API #define VTUNE(STATEMENT) #else
  • 2. 2C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp #define VTUNE(STATEMENT) STATEMENT #endif //Macro to add VTUNE API call iff INTEL_NO_ITTNOTIFY_API is not #defined #ifdef INTEL_NO_ITTNOTIFY_API #define VTUNE_TASK(DOMAIN, FUNC, STATEMENTS) STATEMENTS #else #define VTUNE_TASK(DOMAIN, FUNC, STATEMENTS) { auto domain = __itt_domain_create(DOMAIN); __itt_task_begin(domain, __itt_null, __itt_null, __itt_string_handle_create("simd_search()")); STATEMENTS __itt_task_end(domain); } #endif // ================================================================= // Class wrapper for std::packaged_task to make different signatures, e.g. int(void), fload(int,int), ... // storable in STL container for thread pool. // === // N.B. simpler mechanism would be std::vector<std:function<void(void)>>; v[i]= std:packaged_task<Type (Type)>(type), // since packaged_task has void operator()(void). However, there is a problem: std::function // requires command object to be copyable and packaged_task has move-only semantics. //================================================================== //============================ //Primary template //============================ template< typename Out = void, typename In = void > struct MyPackagedTask { virtual ~MyPackagedTask(void) { } }; //============================ //Explicit specialization, acts as base class // MyPackagedTask<>& poly = *new MyPackagedTaks<MyType(OtherType)>; // poly(); --> calls MyPackagedTaks<MyType(OtherType)>::op() //============================ template<> struct MyPackagedTask<> { virtual ~MyPackagedTask(void) { } virtual void operator()(void) { } }; std::mutex last_return_mutex; //============================ //Specialization for function signature // MyPackagedTaks<MyType(OtherType)> //============================ template< typename Out, typename... In > struct MyPackagedTask< Out(In...) > : public MyPackagedTask<> { MyPackagedTask(std::function<Out(In...)> func, In... in) : m_task(std::bind(func, in...)) { } virtual ~MyPackagedTask(void) { } MyPackagedTask(MyPackagedTask&& other) : m_task(std::move(other.m_task))
  • 3. 3C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp { } void operator()(void) { m_task(); } std::future<Out> get_future(void) { return m_task.get_future(); } private: std::packaged_task<Out(void)> m_task; }; //====================================================================== // Simple thread pool class // Places tasks onto common queue // Allocates fixed number of threads which pop tasks. // TODO: Load balancing, cache ping-pong (RFOs), VTune optimisation. //==================================================================== class ThreadPool { public: ThreadPool(unsigned max_num_threads = 1U << 31) : m_done(false), //notice to threads to shut down m_print_shutdown_msg(true), //print or not print shutdown msg m_max_num_threads(max_num_threads), //maximum num threads allowed in pool m_num_threads(0), //num threads allocated by the pool m_processing(0), //tasks still running m_cancel(false) { } ~ThreadPool(void) { //Shut down threads iff user has not alread called shutdown() if (!m_done) { shutdown(); } } //================= // Push task onto pool //================ template< typename Out, typename... In > std::future<Out> push(std::function<Out(In...)> func, In... in) { //Create task, store future MyPackagedTask<Out(In...)> task(func, in...); std::future<Out> ret_val = task.get_future(); //lock task queue, push the new task onto the queue, notify threads waiting on empty queue, release lock if (m_cancel == false) { { std::unique_ptr<MyPackagedTask<>> ptr(new MyPackagedTask<Out(In...)>(std::move(task))); // Base*=&Derived for poly call std::lock_guard<std::mutex> lock(m_tasks); //lock queue m_pool.push_back(std::move(ptr)); //push task } //release lock m_condition_variable.notify_all(); //notify waiting threads //spawn a thread(async will prevent oversubscription) and store thread future(to check for thread termination at pool shutdown) if ((++m_num_threads <= m_max_num_threads)) { std::unique_lock<std::mutex> lock(m_threads); m_thread_list.push_back(std::async(std::launch::async, &ThreadPool::run_tasks, this)); }
  • 4. 4C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp } //return packaged_task future so that caller can wait for result return ret_val; } //================= // get number of threads allocated //================ unsigned get_num_threads(void) { std::unique_lock<std::mutex> lock(m_threads); return m_thread_list.size(); } //================= // Cancel all tasks but keep threads alivre (for reuse by next set of tasks during iteration). Not yest tested. //================ void cancel_tasks(void) { m_cancel = true; while (m_processing != 0); { std::unique_lock<std::mutex> lock(m_tasks); //lock task queue m_pool.clear(); } m_cancel = false; } //================= // Kill all threads and print out shutdown message (iff msg==true) //================ void shutdown(bool msg = true) { m_print_shutdown_msg = msg; { if (m_print_shutdown_msg) { std::unique_lock<std::mutex> lock(m_shutdown); std::cout << std::endl << "================================================================= " << std::endl << "Shutting down threads: "; } } cancel_tasks(); //Notify all threads of thread pool termination m_done = true; m_condition_variable.notify_all(); //Loop over all threads and wait for them to terminate { std::unique_lock<std::mutex> lock(m_threads); for (auto& elem : m_thread_list) { while (!elem.valid()); elem.get(); } } //Clear thread queue { std::unique_lock<std::mutex> lock(m_threads); m_thread_list.clear(); } //Print out shutdown message if (m_print_shutdown_msg)
  • 5. 5C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp { std::unique_lock<std::mutex> lock(m_shutdown); std::cout << std::endl << "=================================================================" << std::endl; } } private: //================= // Pop and run tasks in threads. //================ void run_tasks(void) { //To avoid branch misprediction, use array to store branch code instead of if-else std::unique_ptr<MyPackagedTask<>> func; std::function<void(void)> branch_true = [&]{ func = std::move(m_pool.front()); m_pool.pop_front(); }; std::function<void(void)> branch_false = [&]{ func = std::unique_ptr<MyPackagedTask<>>(new MyPackagedTask<>); };//NOP std::function<void(void)> switch_func[2]{ branch_false, branch_true}; while (!m_done) { // Only wait if there are still tasks to be processed { bool empty; //Status of task queue std::unique_lock<std::mutex> lock(m_tasks); //lock task queue m_condition_variable.wait_for(lock, std::chrono::nanoseconds(100), [&]{ return !(empty = m_pool.empty()) || m_done; }); //wakeup if queue empty or shutdown switch_func[!empty && !m_done](); //only run non-NOP if queue not empty and not shutdown. } ++m_processing; (*func)(); --m_processing; } //Print out shutdown msg if (m_done & m_print_shutdown_msg) { std::unique_lock<std::mutex> lock(m_shutdown); std::cout << std::this_thread::get_id() << " "; } } std::atomic<bool> m_done; std::atomic<bool> m_print_shutdown_msg; std::atomic<unsigned> m_max_num_threads; std::atomic<unsigned> m_num_threads; std::atomic<unsigned> m_processing; std::atomic<bool> m_cancel; std::deque< std::unique_ptr<MyPackagedTask<>> > m_pool; std::vector< std::future<void> > m_thread_list; std::mutex m_threads; std::mutex m_tasks; std::mutex m_shutdown; std::condition_variable m_condition_variable; }; //===================================== // Simple test class // Creates a few tasks, pushes them onto thread pool, gets results //================================================================== struct SimpleTest { SimpleTest(void) try { std::cout << std::endl << "Simple Test......" << std::endl << std::endl; //Create thread pool ThreadPool thread_pool;
  • 6. 6C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp //create a task std::thread::id f1_id; std::function< int(int, int) > f1 = [&](int i, int j) { f1_id = std::this_thread::get_id(); std::this_thread::sleep_for(std::chrono::milliseconds(1000)); return i*j; }; //create another task std::thread::id f2_id; std::function< std::string(void) > f2 = [&](void) { f2_id = std::this_thread::get_id(); std::this_thread::sleep_for(std::chrono::milliseconds(1000)); return std::string("return value of f2"); }; //create another task std::thread::id f3_id; std::string f3_str; std::function< void(void) > f3 = [&](void) { f3_id = std::this_thread::get_id(); std::this_thread::sleep_for(std::chrono::milliseconds(1000)); f3_str = "f3 called"; }; //push tasks auto start = std::chrono::high_resolution_clock::now(); //start timer std::future<int> fut_1(std::move(thread_pool.push(f1, 10, 20))); std::future<std::string> fut_2 = thread_pool.push(f2); int fut_1_res = fut_1.get(); std::string fut_2_res = fut_2.get(); auto end = std::chrono::high_resolution_clock::now(); //stop timer //std::future<void> fut_3 = thread_pool.push(f3); //TODO - fix compilation error. // std::cout << typeid(decltype(thread_pool.push(f3))).name() << std::endl; // gives std::future <void> //std::future<void> test_fut; //compiles //std::future<void> test_fut1 = std::move(test_fut); //compiles //thread_pool.push(f3); // doesn't compile // std::function< int(int) > f4 = [&](int i){ return ++i; }; //compiles // thread_pool.push(f4, 2); //compiles // std::function< void(int) > f4 = [&](int i){ ++i; }; //compiles // thread_pool.push(f4, 2); //doesn't compile //print num of threads running, thread id for tasks, result sent back by tasks std::cout << "num threads=" << thread_pool.get_num_threads() << std::endl; std::cout << "f1 thread id=" << f1_id << std::endl; std::cout << "f1's result: " << fut_1_res << std::endl; std::cout << "f2 thread id=" << f2_id << std::endl; std::cout << "f2's result: " << fut_2_res << std::endl; //std::cout << "f3 thread id=" << f3_id << std::endl; //std::cout << "f3's result: " << f3_str << std::endl; std::cout << "thread_pool time = " << std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() << "ns" << std:: endl; //cleanup threads //thread_pool.shutdown(); test dtor } catch (...) { std::cout << "SimpleTest exception" << std::endl; } }; //============================================================== // Parallel vs binary search test class // t - number of threads ( > 1) // Binary = O(log_2(n/t)) = O(log_2(n)-log_2(t)) // Parallel = O(log_t(n)) // // Binary = std::find (single threaded for comparison).
//==============================================================
// Parallel vs binary search test class
// t - number of threads ( > 1 )
// Binary = O(log_2(n/t)) = O(log_2(n)-log_2(t))
// Parallel = O(log_t(n))
//
// Binary = std::find (single threaded for comparison).
// Parallel: (1) Split array into equal chunks, push them onto thread pool
//           (2) Chunk containing search-val returns true. N.B. predicate simply returns begin() <= val <= end().
//           (3) Chunk returning true replaces array and step (1) is repeated.
// N.B. Parallel search gets insertion point of nearest match rather than vector.end() if no match.
// Not optimised for RFOs (cache ping-pong), load balancing, VTune or early match (binary is quicker if an early match is found).
// Benchmarks show high overhead of thread pool.
//===============================================================
struct ParallelSearch
{
    //Choose which to run
    bool is_lock_free = false;  //run lock-free lambda (if enabled, set variable "factor", below, to 100 since the search is time consuming)
    bool is_lock_based = false; //run lock-based lambda (if enabled, set variable "factor", below, to 100 since the search is time consuming)
    bool is_simd = true;        //run simd-based lambda

    //Choose number of threads
    //unsigned num_threads = 2; //To get thread overhead (parallel/binary = 1 for no overhead)
    //const unsigned num_threads = std::thread::hardware_concurrency()/2; //undersubscription, should run slower than optimal
    const unsigned num_threads = std::thread::hardware_concurrency(); //Should be optimal choice
    //const unsigned num_threads = 2*std::thread::hardware_concurrency(); //moderate oversubscription, should run slower than optimal
    //const unsigned num_threads = 4*std::thread::hardware_concurrency(); //heavy oversubscription, should run slower than optimal
    //const unsigned num_threads = 128*std::thread::hardware_concurrency(); //massive oversubscription, should run slower than optimal

    ParallelSearch(void)
    try
    {
        std::atomic<bool> done(false); //flag used in lock-free search to notify of completion

        //Create large, sorted array on heap to avoid seg fault.
        const unsigned size = 2 << 24;
        std::vector<unsigned> my_array(size);
        for (auto& elem : my_array)
        {
            static unsigned i = 0;
            elem = 2 * i; //even numbers
            ++i;
        }

        //double-word atomic containing the address of a matching chunk and the new chunk length (size, size/t, size/t^2 ...)
        struct DoubleWord
        {
            unsigned* m_address;
            unsigned m_chunk_length;
        };
        std::atomic<DoubleWord> chunk_address_and_length(DoubleWord{ &my_array[0], size / num_threads });

        //val searched for (TODO: binary search is faster than parallel search if binary finds an early match. Need to terminate parallel search earlier)
        bool even = true;
        unsigned val = my_array[((size >> 1) + ((size >> 1) - 1)*rand() / RAND_MAX)] + (even ? 0 : 1); //even/odd number --> found/not found

        //Variables for found position, passes taken and whether to print out progress (incurs overhead)
        unsigned* ret_val = &my_array[0];
        int passes = 0; //int required by g++ autovectoriser
        bool printout = false;

        //SIMD lambda (proved to be quite difficult getting g++ to autovectorise)
        //(N.B. This was compiled for Intel i7-3720QM Ivy Bridge processor)
        // 1. Split array into t chunks
        // 2. Allocate chunks to t SIMD lanes
        // 3. Each lane checks chunk.begin() <= search-val <= chunk.end()
        // 4. The SIMD lane getting a match sets array = chunk
        // 5. Steps 1 to 4 repeated until chunk is 1 element long.
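        //Illustrative arithmetic for the complexity claim above (a sketch, not measured data): with
        //size = 2<<24 = 2^25 elements and 8 SIMD lanes per pass, the chunk shrinks by a factor of 8 each
        //pass, so ceil(log_8(2^25)) = ceil(25/3) = 9 passes reach a 1-element chunk - which is why
        //chunk_length[] below holds exactly 9 entries - whereas binary search needs about log_2(2^25) = 25
        //comparisons. The printed ratio O(n_parallel)/O(n_binary) = (passes+1)/log_2(size) is therefore
        //expected to be roughly 10/25 = 0.4 for these parameters.
        static_assert((size >> (3 * 8)) == 2, "eight passes of /8 leave a 2-element chunk; pass 9 reaches 1 element");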
        std::function<bool(void)> simd_search = [&]()
        {
            //Alignment (SSE - 16-byte SIMD register, AVX - 32-byte SIMD register)
            const unsigned alignment = 16; //g++ bug with 32-byte (http://gcc.gnu.org/bugzilla/show_bug.cgi?id=56787)

            //Pre-calculate chunk sizes (size/8, size/64, size/8^3 ... 1 element(s))
            alignas(alignment) int chunk_length[9]{ size >> 3, size >> 6, size >> 9, size >> 12, size >> 15,
                size >> 18, size >> 21, size >> 24, 1 };

            //Pre-calculate lower index for lower <= val <= upper. N.B. This is converted to lower[n]/8, lower[n]/64 ...
            alignas(alignment) int lower_index[8]{ 0, size, 2 * size, 3 * size, 4 * size, 5 * size, 6 * size, 7 * size };

            //Pre-calculate num of SIMD lanes to allocate so the for loop can be vectorised
            alignas(alignment) int limits[9]{ 8, 8, 8, 8, 8, 8, 8, 8, 2 };

            //Running tally of start of chunk to be searched
            alignas(alignment) int offset = 0;
            alignas(alignment) int tmp_offset = 0;

            //Loop until chunk length is 1 element
            for (passes = 0; passes < 9; ++passes)
            {
                //Following lambda is a test to see if hotspots marked "LINE X" and "LINE Y", below, are due to memory stalls.
                //It turns out prefetch does eliminate hotspots X and Y, but adds overhead of its own, so this search algorithm
                //is unavoidably memory-bound unless something along the lines of a heap-ordered array (i.e. the array is laid
                //out as a breadth-first n-ary tree) is used to convert random access to linear access without need for
                //scatter-gather.
                //#pragma omp parallel for //Adds too much overhead
                [&]() //Sadly, won't vectorise due to function call
                {
                    unsigned *__restrict__ tmp_ret_val = (unsigned*)__builtin_assume_aligned(&my_array[0], alignment); //Requirement for autovectorisation.
                    for (int pos = 0; pos < limits[passes]; ++pos) //Loop over SIMD lanes to prefetch data they will use
                    {
                        int tmp = pos*chunk_length[passes]; //Get lower index for chunk interval
                        __builtin_prefetch(&my_array[0] + offset + tmp); //See if it removes hotspot from "LINE X", below
                        __builtin_prefetch(&my_array[0] + offset + tmp + chunk_length[passes] - 1); //See if it removes hotspot from "LINE Y", below
                    }
                }();

                //Fork: Assign each chunk to a SIMD lane
                //N.B. Use lambda to force vectorisation of the loop. Without it, the loop is unrolled but not SLP-vectorised. This does autovectorise under g++ 4.8
                //N.B. Code has been broken down into painfully simple steps to help the autovectoriser and to pinpoint which operations cause trouble
                [&]()
                {
                    unsigned *__restrict__ tmp_ret_val = (unsigned*)__builtin_assume_aligned(&my_array[0], alignment); //Requirement for autovectorisation.
                    alignas(alignment) int chunk = chunk_length[passes];
                    for (alignas(alignment) int pos = 0; pos < limits[passes]; ++pos) //Loop over SIMD lanes
                    {
                        //Find matching chunk by adding 0 to the offset for no-match and the chunk offset for a match
                        alignas(alignment) int tmp = pos*chunk; //Lower index of chunk range without offset
                        //int tmp = lower_index[pos] >> 3; //g++ bug (http://gcc.gnu.org/bugzilla/show_bug.cgi?id=56787). Can't use 32-byte AVX.
                        alignas(alignment) int lower_ind = offset + tmp; //Lower index of chunk range
                        alignas(alignment) int upper_ind = lower_ind + chunk - 1; //Upper index of chunk range
                        unsigned lower_val = tmp_ret_val[lower_ind]; //LINE X - Hotspot (eliminated by the prefetch above)
                        unsigned upper_val = tmp_ret_val[upper_ind]; //LINE Y - Hotspot (eliminated by the prefetch above)
                        alignas(alignment) bool test_lower = lower_val <= val; //Lower
                        alignas(alignment) bool test_upper = val <= upper_val; //and upper limit check
                        alignas(alignment) bool test = test_lower && test_upper; //is search-val inside the chunk for this SIMD lane?
                        tmp_offset += test*tmp; //Horrible construct to get it to autovectorise. It masks out SIMD lanes that don't contain the search-val.

                        //Following fails because it is "not suitable for gather" (whatever that means)
                        //offset += ((tmp_ret_val[offset+tmp] <= val) & (val <= tmp_ret_val[offset+tmp+chunk_length[passes]-1]))*tmp;
                        //Following fails because of "control flow" (can't see why g++ doesn't autovectorise it; the control flow can be replaced with a masked op)
                        //if((tmp_ret_val[offset+tmp] <= val) && (val <= tmp_ret_val[offset+tmp+chunk_length[passes]-1])) tmp_offset = tmp;
                    }
                }(); //Join: end of SIMD

                //Update chunk start address index
                offset = tmp_offset;
                /*std::cout << "offset=" << offset << std::endl;
                std::cout << "passes=" << passes;
                std::cout << ", val=" << val;
                std::cout << ", range=[" << array[offset] << "," << array[offset+1];
                std::cout << ", chunk length=" << chunk_length[passes] << std::endl; */
            }

            //Update final index of search-val
            ret_val = &my_array[0] + offset;
            return true;
        };

        //Lock-free lambda for each thread
        //Operation:
        //1. The array is split into t (num of threads) chunks
        //2. Each thread examines its chunk
        //3. If a match is found in a chunk, the thread changes the array to be that chunk.
        //4. The process is repeated from step 1.
        //t threads continuously monitor the array and process their chunk of the array. Since the array pointer is
        //atomic, when one thread sees a matching chunk and changes the array to be that chunk, this is picked up
        //by all threads. No synchronisation is needed.
        //arg chunk_pos - section of chunk to search (0 - [begin, begin+chunk_length/t], 1 - [begin+chunk_length/t, begin+2*chunk_length/t], ...)
        std::atomic<unsigned> running_threads(0);
        std::atomic<bool> go(false);
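        //A minimal sketch of why the link line pulls in -latomic (an observation added here, not part of
        //the search algorithm): DoubleWord holds a pointer plus an unsigned and is padded to 16 bytes,
        //which generally exceeds the width the hardware can update atomically unless a 16-byte
        //compare-and-swap (cmpxchg16b) is available, so std::atomic<DoubleWord> may be implemented via
        //libatomic. Whether it really is lock-free on a given build can be queried at run time; the check
        //is left commented out so it does not perturb the timings:
        //std::cout << "DoubleWord atomic lock-free: " << std::boolalpha
        //          << chunk_address_and_length.is_lock_free() << std::endl;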
        std::function<bool(unsigned)> lock_free = [&](unsigned chunk_pos)
        {
            //Increment running thread count
            ++running_threads;

            //Keep all threads on hold until signalled to begin together (for timings).
            while (!go);

            //Keep searching until a thread notifies completion.
            while (!done)
            {
                //capture chunk address and length
                DoubleWord capture = chunk_address_and_length;

                //Check if search-val is between chunk.begin() and chunk.end()
                unsigned *begin = capture.m_address + chunk_pos*capture.m_chunk_length;
                unsigned *end = begin + capture.m_chunk_length - 1;
                unsigned test1 = *begin, test2 = *end;
                if (*begin <= val && val <= *end)
                {
                    //Print out iterations (adds significant overhead)
                    static std::mutex printout_mutex;
                    if (printout)
                    {
                        std::unique_lock<std::mutex> lock(printout_mutex);
                        std::cout << "Parallel find (pass " << passes << "): Closest match "
                            << *begin << "<=" << val << "<=" << *end
                            << ", chunk length=" << capture.m_chunk_length << std::endl;
                    }

                    //Update parent variables for printouts
                    ret_val = begin;
                    ++passes; //keep count of passes (0 - size, 1 - size/t, 2 - size/t^2 ... with t = num threads)

                    std::function<void(void)> branch_true = [&]() //IF
                    {
                        //Update chunk length and address
                        capture.m_chunk_length = (capture.m_chunk_length >= num_threads ? (capture.m_chunk_length / num_threads) : 1); //divide chunk evenly
                        capture.m_address = begin; //point to this chunk
                        chunk_address_and_length = capture;
                    };
                    std::function<void(void)> branch_false = [&]() //ELSE
                    {
                        done = true; //notify parent and sister threads of completion
                    };
                    std::function<void(void)> if_else[2]{ branch_false, branch_true }; //{else, if}
                    if_else[capture.m_chunk_length > 1](); //if-else
                }
                else
                {
                    std::this_thread::sleep_for(std::chrono::nanoseconds(5000));
                }
            }
            return true;
        };

        //Create thread pool for lock-based search
        static ThreadPool thread_pool(num_threads);

        //Notification of completion of lock-based search
        std::condition_variable finished;

        //Lock-based lambda for each thread. It simply tests whether array[pos] <= search_val <= array[pos + chunk_length - 1]
        //and iff true, spawns t threads to narrow down the search, iteratively arriving at the insertion point.
        std::function<bool(unsigned*, unsigned)> lock_based = [&](unsigned* tmp, unsigned chunk_length)
        {
            //Keep all threads on hold until signalled to begin together (for timings).
            while (!go);

            //Check if search-val is between chunk.begin() and chunk.end()
            if (*tmp <= val && val <= *(tmp + chunk_length - 1))
            {
                //Print out iterations (adds significant overhead)
                if (printout)
                {
                    std::cout << "Parallel find (pass " << passes << "): Closest match " << *tmp << "<=" << val << "<="
                        << (chunk_length > 1 ? *(tmp + chunk_length - 1) : *tmp == val ? val : *(tmp + 1))
                        << ", chunk length=" << chunk_length << std::endl;
                }

                //Update parent variables for printouts
                ret_val = tmp;
                ++passes; //keep count of passes (0 - size, 1 - size/t, 2 - size/t^2 ... with t = num threads)
                //Spawn new tasks to process this chunk
                //Following peculiar construct is to avoid branch misprediction by using an array of fn ptrs to replace if-else.
                //Need VTune to test whether it actually saves us any mispredictions.
                std::function<void(void)> branch_true = [&]() //IF
                {
                    chunk_length = (chunk_length >= num_threads ? (chunk_length / num_threads) : 1); //divide chunk evenly
                    for (unsigned index = 0; index < num_threads; ++index)
                    {
                        thread_pool.push(lock_based, tmp + index*chunk_length, chunk_length);
                    }
                };
                std::function<void(void)> branch_false = [&]() //ELSE
                {
                    finished.notify_one(); //chunk length is 1, so we are finished dividing-and-conquering
                };
                std::function<void(void)> if_else[2]{ branch_false, branch_true }; //{else, if}
                if_else[chunk_length > 1](); //if-else
            }
            return true;
        };

        std::cout << std::endl << "Parallel vs Binary Search......" << std::endl << std::endl;

        //Obtain position of element (to verify parallel search finds correct position).
        auto pos = std::find(my_array.begin(), my_array.end(), val);

        //Ordinary binary search for timing comparison
        std::cout << std::endl << "==============================================================================" << std::endl;
        std::cout << "Running binary search, please wait a few minutes ..." << std::endl;
        std::cout << "==============================================================================" << std::endl;
        unsigned factor = 10000; //number of times to run search
        auto start_binary = std::chrono::high_resolution_clock::now(); //start timer
        VTUNE(__itt_resume();)
        for (unsigned i = 0; i < factor; ++i) std::binary_search(my_array.begin(), my_array.end(), val); //binary search
        VTUNE(__itt_pause();)
        auto end_binary = std::chrono::high_resolution_clock::now(); //stop timer

        //print out results of binary search
        using std::chrono::duration_cast;
        using std::chrono::nanoseconds;
        std::cout << "clock resolution is: "
            << static_cast<double>(std::chrono::high_resolution_clock::period::num) << " ns" << std::endl;
        std::cout << "std::find: val=" << val << ", element=" << (pos != my_array.end() ? *pos : -1)
            << ", index=" << pos - my_array.begin() << ", found==" << std::boolalpha << (pos != my_array.end())
            << ", time=" << duration_cast<nanoseconds>(end_binary - start_binary).count() << "ns" << std::endl;

        //Parallel searches
        //SIMD search
        if (is_simd)
        {
            std::cout << std::endl << "==============================================================================" << std::endl;
            std::cout << "Running simd parallel search, please wait a few minutes ..." << std::endl;
            std::cout << "==============================================================================" << std::endl;

            //Kick off the search
            auto start = std::chrono::high_resolution_clock::now(); //start timer
            VTUNE(__itt_resume();)
            VTUNE_TASK("Parallel Search", "simd_search()",
                for (unsigned i = 0; i < factor; i++) simd_search();
            )

            //Wait for result and then get the insertion point and number of passes
            VTUNE(__itt_pause();)

            //get execution time
            auto end = std::chrono::high_resolution_clock::now(); //stop timer
            auto parallel_time = std::chrono::duration_cast<nanoseconds>(end - start).count();

            //print results
            double complexity = (passes + 1) / (std::log(size) / std::log(2));
            auto binary_time = duration_cast<nanoseconds>(end_binary - start_binary).count();
            std::cout << "Simd results:" << std::endl;
            std::cout << "Size of array=" << size / 1000000 << " million elements" << std::endl;
            std::cout << "Search repeated " << factor << " times" << std::endl;
            std::cout << "number of threads=" << running_threads << std::endl;
            std::cout << "val=" << val << ", [element_lower,element_upper]=[" << ret_val[0] << "," << ret_val[1] << "]";
            std::cout << ", [index_lower,index_upper]=[" << ret_val - &my_array[0] << "," << ret_val + 1 - &my_array[0] << "]" << std::endl;
            std::cout << "O(n_parallel)/O(n_binary)=" << complexity << std::endl;
            std::cout << "Time(parallel) / Time(binary) = " << parallel_time << "ns" << "/" << binary_time << "ns";
            std::cout << " = ";
            std::cout << parallel_time / binary_time << std::endl;
            std::cout << "threading overhead =" << parallel_time - complexity*binary_time << "ns" << std::endl;
        }

        //Lock-free multithreaded search
        if (is_lock_free)
        {
            std::cout << std::endl << "==============================================================================" << std::endl;
            std::cout << "Running lock-free parallel search, please wait a few minutes ..." << std::endl;
            std::cout << "==============================================================================" << std::endl;

            double parallel_time = 0;
            for (unsigned i = 0; i < factor; ++i)
            {
                //reset passes counter, chunk struct, done flag
                passes = 0;
                chunk_address_and_length = DoubleWord{ &my_array[0], size / num_threads };
                done = false;

                //Kick off the search
                //auto start = std::chrono::high_resolution_clock::now(); //start timer
                std::vector<std::future<bool>> futures;
                go = false;
                futures.push_back(std::move(std::async(std::launch::deferred, lock_free, 0)));
                for (unsigned chunk_pos = 1; chunk_pos < num_threads; ++chunk_pos)
                {
                    futures.push_back(std::move(std::async(std::launch::async, lock_free, chunk_pos)));
                }

                //Wait for result and then get the insertion point and number of passes
                auto start = std::chrono::high_resolution_clock::now(); //start timer
                VTUNE(__itt_resume();)
                VTUNE_TASK("Parallel Search", "lock_free()",
                    go = true;
                    futures[0].get();
                )
                VTUNE(__itt_pause();)

                //get execution time
                auto end = std::chrono::high_resolution_clock::now(); //stop timer
                parallel_time += std::chrono::duration_cast<nanoseconds>(end - start).count();
            }

            //print results
            double complexity = (passes + 1) / (std::log(size) / std::log(2));
            auto binary_time = duration_cast<nanoseconds>(end_binary - start_binary).count();
            std::cout << "Lock free results:" << std::endl
                << "Size of array=" << size / 1000000 << " million elements" << std::endl
                << "Search repeated " << factor << " times" << std::endl
                << "number of threads=" << running_threads << std::endl
                << "val=" << val << ", [element_lower,element_upper]=[" << ret_val[0] << "," << ret_val[1] << "]"
                << ", [index_lower,index_upper]=[" << ret_val - &my_array[0] << "," << ret_val + 1 - &my_array[0] << "]" << std::endl
                << "O(n_parallel)/O(n_binary)=" << complexity << std::endl
                << "Time(parallel) / Time(binary) = " << parallel_time << "ns" << "/" << binary_time << "ns"
                << " = " << parallel_time / binary_time << std::endl
                << "threading overhead =" << parallel_time - complexity*binary_time << "ns" << std::endl;
        }

        //Lock-based multithreaded search
        if (is_lock_based)
        {
            std::cout << std::endl << "==============================================================================" << std::endl;
            std::cout << "Running lock-based parallel search, please wait a few minutes ..." << std::endl;
            std::cout << "==============================================================================" << std::endl;

            double parallel_time = 0;
            for (unsigned i = 0; i < factor; ++i)
            {
                //reset passes counter
                passes = 0;

                //Kick off the search
                //auto start = std::chrono::high_resolution_clock::now(); //start timer
                go = false;
                auto f = thread_pool.push(lock_based, &my_array[0], size);

                //Wait for result and then get the insertion point and number of passes
                {
                    //wait for completion
                    std::mutex dummy;
                    std::unique_lock<std::mutex> lock(dummy);
                    auto start = std::chrono::high_resolution_clock::now(); //start timer
                    VTUNE(__itt_resume();)
                    VTUNE_TASK("Parallel Search", "lock_based()",
                        go = true;
                        finished.wait(lock);
                    )
                    VTUNE(__itt_pause();)

                    //get execution time
                    auto end = std::chrono::high_resolution_clock::now(); //stop timer
                    parallel_time += std::chrono::duration_cast<nanoseconds>(end - start).count();
                    thread_pool.cancel_tasks();
                    std::this_thread::sleep_for(std::chrono::milliseconds(10));
                }
            }

            //kill thread pool
            thread_pool.shutdown(false);

            //print results
            double complexity = (passes + 1) / (std::log(size) / std::log(2));
            auto binary_time = duration_cast<nanoseconds>(end_binary - start_binary).count();
            std::cout << "Lock based results:" << std::endl
                << "Size of array=" << size / 1000000 << " million elements" << std::endl
                << "Search repeated " << factor << " times" << std::endl
                << "number of threads=" << running_threads << std::endl
                << "val=" << val << ", [element_lower,element_upper]=[" << ret_val[0] << "," << ret_val[1] << "]"
                << ", [index_lower,index_upper]=[" << ret_val - &my_array[0] << "," << ret_val + 1 - &my_array[0] << "]" << std::endl
                << "O(n_parallel)/O(n_binary)=" << complexity << std::endl
                << "Time(parallel) / Time(binary) = " << parallel_time << "ns" << "/" << binary_time << "ns"
                << " = "
                << parallel_time / binary_time << std::endl
                << "threading overhead =" << parallel_time - complexity*binary_time << "ns" << std::endl;
        }
    }
    catch (...)
    {
        std::cout << "ParallelSearch exception" << std::endl;
    }
};

int main(void)
{
    //SimpleTest simple_test;
    VTUNE(__itt_pause();)
    ParallelSearch parallel_search;

    char c;
    std::cout << "Press any key to exit" << std::endl;
    std::cin >> c; //keep console alive
}
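//==========================================================================================================
// Minimal sketch (the helper name is illustrative only and is not called above): std::lower_bound is the
// standard-library way to obtain an insertion point on sorted data, so it can serve as a reference when
// cross-checking the ret_val index reported by the parallel searches. Note that the parallel searches
// report the start of the last matching chunk, so the two indices need not coincide exactly when there is
// no exact match for the search value.
//==========================================================================================================
inline std::vector<unsigned>::difference_type reference_insertion_index(
    const std::vector<unsigned>& sorted_data, unsigned search_val)
{
    //lower_bound returns the first element not less than search_val; its distance from begin() is the
    //insertion point.
    return std::lower_bound(sorted_data.begin(), sorted_data.end(), search_val) - sorted_data.begin();
}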