/*
* multi.cpp
*
* Created on: 13 Mar 2014
* Author: Russell John Childs.
*/
//=======================================================================================================
// COPYRIGHT NOTICE
// This code sample and ideas embodied remain the property of Russell John Childs, PhD and have been
// distributed as a representative example of my use of C++11 features.
//=======================================================================================================
//====================================================
// File contains
// (1) Implementation of lock-based thread pool.
// (2) Implementation of lock-free "thread pool".
// (3) Implementation of SIMD, autovectorisation (N.B. This was compiled for Intel i7-3720QM Ivy Bridge processor)
// Implementation of parallel search (using lock-based thread pool, lock-free event-based pool, SIMD parallel for):
// 1) Split sorted array into <num_threads> equal chunks
// 2) Assign each chunk to a thread.
// 3) Thread returns true iff chunk.begin() <= search_val <= chunk.end()
// 4) Replace array with chunk that returned true and return to step 1
// Complexity:
// t - number of threads ( > 1 )
// Binary = O(log_2(n/t)) = O(log_2(n)-log_2(t))
// Parallel = O(log_t(n))
//
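// Worked example (added, illustrative): for n = 2^25 elements and t = 8 threads, the formulas above give
//   Binary   = log_2(n/t) = 25 - 3 = 22 passes
//   Parallel = log_t(n)   = 25 / 3 ~ 8.3 passes
// i.e. each parallel pass discards a factor of t rather than a factor of 2, at the cost of t range
// checks per pass plus the threading/SIMD overhead measured by the benchmarks below.
//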
// Compiling this code sample (Linux Mint - g++ 4.8)
//
// Compiler options:
// g++ -O3 -fopenmp -mavx -m64 -g -Wall -c -fmessage-length=0 -fno-omit-frame-pointer
// -ffast-math -ftree-vectorizer-verbose=3 -std=c++11 -I/opt/intel/vtune_amplifier_xe_2013/include
// multithreading.cpp
//
// Linker options:
// g++ -fopenmp -L/opt/intel/vtune_amplifier_xe_2013/lib64/ -o "multithreading" $(OBJS) $(USER_OBJS) $(LIBS) -lpthread -latomic -littnotify -ldl
//
//==============================================================
#include <thread>
#include <future>
#include <condition_variable>
#include <atomic>
#include <functional>
#include <deque>
#include <vector>
#include <set>
#include <iostream>
#include <string>
#include <sstream>
#include <cmath>
#include <cstdlib> //rand(), RAND_MAX
#include <algorithm>
#include <omp.h>
#include <immintrin.h>
//#include <cilk/cilk.h>
//Uncomment following #define if you do not have Intel VTune Amplifier XE 2013 performance profiler.
#define INTEL_NO_ITTNOTIFY_API
//include Vtune API header iff INTEL_NO_ITTNOTIFY_API is not #defined
#ifndef INTEL_NO_ITTNOTIFY_API
#include "ittnotify.h"
#endif
//Macro to add VTUNE API call iff INTEL_NO_ITTNOTIFY_API is not #defined
#ifdef INTEL_NO_ITTNOTIFY_API
#define VTUNE(STATEMENT)
#else
#define VTUNE(STATEMENT) STATEMENT
#endif
//Macro to add VTUNE API call iff INTEL_NO_ITTNOTIFY_API is not #defined
#ifdef INTEL_NO_ITTNOTIFY_API
#define VTUNE_TASK(DOMAIN, FUNC, STATEMENTS) STATEMENTS
#else
#define VTUNE_TASK(DOMAIN, FUNC, STATEMENTS) \
{ \
auto domain = __itt_domain_create(DOMAIN); \
__itt_task_begin(domain, __itt_null, __itt_null, __itt_string_handle_create(FUNC)); \
STATEMENTS \
__itt_task_end(domain); \
}
#endif
// =================================================================
// Class wrapper for std::packaged_task to make different signatures, e.g. int(void), float(int,int), ...
// storable in STL container for thread pool.
// ===
// N.B. simpler mechanism would be std::vector<std::function<void(void)>>; v[i]= std::packaged_task<Type(Type)>(type),
// since packaged_task has void operator()(void). However, there is a problem: std::function
// requires command object to be copyable and packaged_task has move-only semantics.
//==================================================================
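//==================================================================
// Minimal sketch of the problem just described (added; the second line is deliberately left
// commented out because it does not compile):
//   std::packaged_task<int()> pt([]{ return 42; });
//   std::function<void(void)> f = std::move(pt); // ill-formed: std::function requires a copyable target
// Hence the MyPackagedTask<> hierarchy below, which type-erases the move-only task instead.
//==================================================================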
//============================
//Primary template
//============================
template< typename Out = void, typename In = void >
struct MyPackagedTask
{
virtual ~MyPackagedTask(void)
{
}
};
//============================
//Explicit specialization, acts as base class
// MyPackagedTask<>& poly = *new MyPackagedTask<MyType(OtherType)>;
// poly(); --> calls MyPackagedTask<MyType(OtherType)>::op()
//============================
template<>
struct MyPackagedTask<>
{
virtual ~MyPackagedTask(void)
{
}
virtual void operator()(void)
{
}
};
std::mutex last_return_mutex;
//============================
//Specialization for function signature
// MyPackagedTask<MyType(OtherType)>
//============================
template< typename Out, typename... In >
struct MyPackagedTask< Out(In...) > : public MyPackagedTask<>
{
MyPackagedTask(std::function<Out(In...)> func, In... in) :
m_task(std::bind(func, in...))
{
}
virtual ~MyPackagedTask(void)
{
}
MyPackagedTask(MyPackagedTask&& other) :
m_task(std::move(other.m_task))
{
}
void operator()(void)
{
m_task();
}
std::future<Out> get_future(void)
{
return m_task.get_future();
}
private:
std::packaged_task<Out(void)> m_task;
};
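//======================================================================
// Usage sketch (added, illustrative; this free function is not called anywhere). It shows the
// polymorphic call path the thread pool below relies on: hold the concrete task through a
// MyPackagedTask<> reference, invoke it type-erased, and collect the result via the stored future.
//======================================================================
inline void my_packaged_task_sketch(void)
{
    MyPackagedTask<int(int, int)> task([](int a, int b){ return a + b; }, 2, 3);
    std::future<int> fut = task.get_future(); //future obtained before the task runs
    MyPackagedTask<>& poly = task;            //Base& bound to Derived for the type-erased call
    poly();                                   //virtual operator() runs the bound packaged_task
    std::cout << "my_packaged_task_sketch: 2 + 3 = " << fut.get() << std::endl; //prints 5
}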
//======================================================================
// Simple thread pool class
// Places tasks onto common queue
// Allocates fixed number of threads which pop tasks.
// TODO: Load balancing, cache ping-pong (RFOs), VTune optimisation.
//====================================================================
class ThreadPool
{
public:
ThreadPool(unsigned max_num_threads = 1U << 31) :
m_done(false), //notice to threads to shut down
m_print_shutdown_msg(true), //print or not print shutdown msg
m_max_num_threads(max_num_threads), //maximum num threads allowed in pool
m_num_threads(0), //num threads allocated by the pool
m_processing(0), //tasks still running
m_cancel(false)
{
}
~ThreadPool(void)
{
//Shut down threads iff user has not already called shutdown()
if (!m_done)
{
shutdown();
}
}
//=================
// Push task onto pool
//================
template< typename Out, typename... In >
std::future<Out> push(std::function<Out(In...)> func, In... in)
{
//Create task, store future
MyPackagedTask<Out(In...)> task(func, in...);
std::future<Out> ret_val = task.get_future();
//lock task queue, push the new task onto the queue, notify threads waiting on empty queue, release lock
if (m_cancel == false)
{
{
std::unique_ptr<MyPackagedTask<>> ptr(new MyPackagedTask<Out(In...)>(std::move(task))); //Base*=&Derived for poly call
std::lock_guard<std::mutex> lock(m_tasks); //lock queue
m_pool.push_back(std::move(ptr)); //push task
} //release lock
m_condition_variable.notify_all(); //notify waiting threads
//spawn a thread (async will prevent oversubscription) and store thread future (to check for thread termination at pool shutdown)
if ((++m_num_threads <= m_max_num_threads))
{
std::unique_lock<std::mutex> lock(m_threads);
m_thread_list.push_back(std::async(std::launch::async, &ThreadPool::run_tasks, this));
}
}
//return packaged_task future so that caller can wait for result
return ret_val;
}
//=================
// get number of threads allocated
//================
unsigned get_num_threads(void)
{
std::unique_lock<std::mutex> lock(m_threads);
return m_thread_list.size();
}
//=================
// Cancel all tasks but keep threads alive (for reuse by next set of tasks during iteration). Not yet tested.
//================
void cancel_tasks(void)
{
m_cancel = true;
while (m_processing != 0);
{
std::unique_lock<std::mutex> lock(m_tasks); //lock task queue
m_pool.clear();
}
m_cancel = false;
}
//=================
// Kill all threads and print out shutdown message (iff msg==true)
//================
void shutdown(bool msg = true)
{
m_print_shutdown_msg = msg;
{
if (m_print_shutdown_msg)
{
std::unique_lock<std::mutex> lock(m_shutdown);
std::cout << std::endl << "=================================================================" << std::endl
<< "Shutting down threads: ";
}
}
cancel_tasks();
//Notify all threads of thread pool termination
m_done = true;
m_condition_variable.notify_all();
//Loop over all threads and wait for them to terminate
{
std::unique_lock<std::mutex> lock(m_threads);
for (auto& elem : m_thread_list)
{
while (!elem.valid());
elem.get();
}
}
//Clear thread queue
{
std::unique_lock<std::mutex> lock(m_threads);
m_thread_list.clear();
}
//Print out shutdown message
if (m_print_shutdown_msg)
{
std::unique_lock<std::mutex> lock(m_shutdown);
std::cout << std::endl << "=================================================================" << std::endl;
}
}
private:
//=================
// Pop and run tasks in threads.
//================
void run_tasks(void)
{
//To avoid branch misprediction, use array to store branch code instead of if-else
std::unique_ptr<MyPackagedTask<>> func;
std::function<void(void)> branch_true = [&]{ func = std::move(m_pool.front()); m_pool.pop_front(); };
std::function<void(void)> branch_false = [&]{ func = std::unique_ptr<MyPackagedTask<>>(new MyPackagedTask<>); };//NOP
std::function<void(void)> switch_func[2]{ branch_false, branch_true};
while (!m_done)
{
// Only wait if there are still tasks to be processed
{
bool empty; //Status of task queue
std::unique_lock<std::mutex> lock(m_tasks); //lock task queue
m_condition_variable.wait_for(lock, std::chrono::nanoseconds(100), [&]{ return !(empty = m_pool.empty()) || m_done; }); //wake up if queue is non-empty or on shutdown
switch_func[!empty && !m_done](); //only run non-NOP if queue not empty and not shutdown.
}
++m_processing;
(*func)();
--m_processing;
}
//Print out shutdown msg
if (m_done && m_print_shutdown_msg)
{
std::unique_lock<std::mutex> lock(m_shutdown);
std::cout << std::this_thread::get_id() << " ";
}
}
std::atomic<bool> m_done;
std::atomic<bool> m_print_shutdown_msg;
std::atomic<unsigned> m_max_num_threads;
std::atomic<unsigned> m_num_threads;
std::atomic<unsigned> m_processing;
std::atomic<bool> m_cancel;
std::deque< std::unique_ptr<MyPackagedTask<>> > m_pool;
std::vector< std::future<void> > m_thread_list;
std::mutex m_threads;
std::mutex m_tasks;
std::mutex m_shutdown;
std::condition_variable m_condition_variable;
};
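//=====================================================================
// Usage sketch (added, illustrative; not called anywhere - SimpleTest below exercises the same API
// in more detail): push a std::function, keep the returned future, then shut the pool down quietly.
//=====================================================================
inline void thread_pool_usage_sketch(void)
{
    ThreadPool pool(4);                                            //cap the pool at 4 threads
    std::function<int(int)> square = [](int x){ return x * x; };
    std::future<int> result = pool.push(square, 7);                //task runs on a pool thread
    std::cout << "thread_pool_usage_sketch: square(7) = " << result.get() << std::endl; //prints 49
    pool.shutdown(false);                                          //suppress the shutdown banner
}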
//=====================================
// Simple test class
// Creates a few tasks, pushes them onto thread pool, gets results
//==================================================================
struct SimpleTest
{
SimpleTest(void) try
{
std::cout << std::endl << "Simple Test......" << std::endl << std::endl;
//Create thread pool
ThreadPool thread_pool;
//create a task
std::thread::id f1_id;
std::function< int(int, int) > f1 = [&](int i, int j)
{
f1_id = std::this_thread::get_id();
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
return i*j;
};
//create another task
std::thread::id f2_id;
std::function< std::string(void) > f2 = [&](void)
{
f2_id = std::this_thread::get_id();
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
return std::string("return value of f2");
};
//create another task
std::thread::id f3_id;
std::string f3_str;
std::function< void(void) > f3 = [&](void)
{
f3_id = std::this_thread::get_id();
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
f3_str = "f3 called";
};
//push tasks
auto start = std::chrono::high_resolution_clock::now(); //start timer
std::future<int> fut_1(std::move(thread_pool.push(f1, 10, 20)));
std::future<std::string> fut_2 = thread_pool.push(f2);
int fut_1_res = fut_1.get();
std::string fut_2_res = fut_2.get();
auto end = std::chrono::high_resolution_clock::now(); //stop timer
//std::future<void> fut_3 = thread_pool.push(f3); //TODO - fix compilation error.
// std::cout << typeid(decltype(thread_pool.push(f3))).name() << std::endl; // gives std::future<void>
//std::future<void> test_fut; //compiles
//std::future<void> test_fut1 = std::move(test_fut); //compiles
//thread_pool.push(f3); // doesn't compile
// std::function< int(int) > f4 = [&](int i){ return ++i; }; //compiles
// thread_pool.push(f4, 2); //compiles
// std::function< void(int) > f4 = [&](int i){ ++i; }; //compiles
// thread_pool.push(f4, 2); //doesn't compile
//print num of threads running, thread id for tasks, result sent back by tasks
std::cout << "num threads=" << thread_pool.get_num_threads() << std::endl;
std::cout << "f1 thread id=" << f1_id << std::endl;
std::cout << "f1's result: " << fut_1_res << std::endl;
std::cout << "f2 thread id=" << f2_id << std::endl;
std::cout << "f2's result: " << fut_2_res << std::endl;
//std::cout << "f3 thread id=" << f3_id << std::endl;
//std::cout << "f3's result: " << f3_str << std::endl;
std::cout << "thread_pool time = "
<< std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() << "ns" << std::endl;
//cleanup threads
//thread_pool.shutdown(); test dtor
}
catch (...)
{
std::cout << "SimpleTest exception" << std::endl;
}
};
//==============================================================
// Parallel vs binary search test class
// t - number of threads ( > 1)
// Binary = O(log_2(n/t)) = O(log_2(n)-log_2(t))
// Parallel = O(log_t(n))
//
// Binary = std::find (single threaded for comparison).
// Parallel: (1) Split array into equal chunks, push them onto thread pool
// (2) Chunk containing search-val returns true. N.B. predicate simply returns begin() <= val <= end().
// (3) Chunk returning true replaces array and step(1) repeated.
// N.B. Parallel search gets insertion point of nearest match rather than vector.end() if no match
// Not optimised for RFOs (cache ping-pong), load balancing, VTune or early match (binary is quicker if early match found).
// Benchmarks show high overhead of thread pool.
//===============================================================
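//===============================================================
// Illustrative helper (added, hypothetical; not used by the code below): the per-chunk predicate all
// three variants implement inline - a sorted chunk "contains" the search value iff its first and last
// elements bracket it, which is what yields the insertion point of the nearest match.
//===============================================================
inline bool chunk_contains(const unsigned* chunk_begin, unsigned chunk_length, unsigned search_val)
{
    return chunk_begin[0] <= search_val && search_val <= chunk_begin[chunk_length - 1];
}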
struct ParallelSearch
{
//Choose which to run
bool is_lock_free = false; //run lock-free lambda (if enabled, should set variable "factor", below, to 100 since the search is time consuming)
bool is_lock_based = false; //run lock-based lambda (if enabled, should set variable "factor", below, to 100 since the search is time consuming)
bool is_simd = true; //run simd-based lambda
//Choose number of threads
//unsigned num_threads = 2; //To get thread overhead (parallel/binary = 1 for no overhead)
//const unsigned num_threads = std::thread::hardware_concurrency()/2; //undersubscription, should run slower than optimal
const unsigned num_threads = std::thread::hardware_concurrency(); //Should be optimal choice
//const unsigned num_threads = 2*std::thread::hardware_concurrency(); //moderate oversubscription, should run slower than optimal
//const unsigned num_threads = 4*std::thread::hardware_concurrency(); //heavy oversubscription, should run slower than optimal
//const unsigned num_threads = 128 * std::thread::hardware_concurrency(); //massive oversubscription, should run slower than optimal
ParallelSearch(void) try
{
std::atomic<bool> done(false); //flag used in lock-free search to notify of completion
//Create large, sorted array on heap to avoid seg fault.
const unsigned size = 2 << 24;
std::vector<unsigned> my_array(size);
for (auto& elem : my_array)
{
static unsigned i = 0;
elem = 2 * i; //even numbers
++i;
}
//double-word atomic containing the address of a matching chunk and the new chunk length (size, size/t, size/t^2 ...)
struct DoubleWord
{
unsigned* m_address;
unsigned m_chunk_length;
};
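//N.B. (added note): DoubleWord is 16 bytes, so std::atomic<DoubleWord> below is a double-word atomic;
//this is presumably why -latomic appears in the linker options at the top of the file.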
std::atomic<DoubleWord> chunk_address_and_length(DoubleWord{ &my_array[0], size / num_threads });
//val searched for (TODO: binary search faster than parallel search if binary finds early match. Need to terminate parallel search earlier)
bool even = true;
unsigned val = my_array[((size >> 1) + ((size >> 1) - 1)*rand() / RAND_MAX)] + (even ? 0 : 1); //even/odd number --> found/not found
//Variables for found position, passes taken and whether to printout progress(incurs overhead)
unsigned* ret_val = &my_array[0];
int passes = 0; //int required by g++ autovectorize
bool printout = false;
//SIMD lambda (Proved to be quite difficult getting g++ to autovectorise)
//(N.B. This was compiled for Intel i7-3720QM Ivy Bridge processor)
// 1. Split array into t chunks
// 2. Allocate chunks to t SIMD lanes
// 3. Each lane checks chunk.begin() <= search-val <= chunk.end()
// 4. The SIMD lane getting a match sets array = chunk
// 5. Steps 1 to 4 repeated until chunk is 1 element long.
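// Illustrative pass schedule (added, worked from the constants below): with size = 2<<24 (~33.5M
// elements) and 8 SIMD lanes per pass, the chunk length shrinks as size>>3 (~4.2M), size>>6, ...,
// size>>24 = 2, then 1, so chunk_length[9]/limits[9] below encode exactly 9 passes down to one element.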
std::function<bool(void)> simd_search = [&]()
{
//Alignment (SSE - 16 byte SIMD register, AVX 32-byte SIMD register)
const unsigned alignment = 16; //g++ bug with 32-byte (http://gcc.gnu.org/bugzilla/show_bug.cgi?id=56787)
//Pre-calculate chunk size (size/8, size/64, size/8^3 ... 1 element(s))
alignas(alignment) int chunk_length[9]{size >> 3, size >> 6, size >> 9, size >> 12, size >> 15, size >> 18, size >> 21, size >> 24, 1};
//Pre-calculate lower index for lower <= val <= upper. N.B. This is converted to lower[n]/8, lower[n]/64 ...
alignas(alignment) int lower_index[8]{0, size, 2 * size, 3 * size, 4 * size, 5 * size, 6 * size, 7 * size};
//Pre-calculate num of SIMD lanes to allocate to for loop to be vectorised
alignas(alignment) int limits[9]{8, 8, 8, 8, 8, 8, 8, 8, 2};
//Running tally of start of chunk to be searched
alignas(alignment) int offset = 0;
alignas(alignment) int tmp_offset = 0;
//Loop until chunk length is 1 element
for (passes = 0; passes<9; ++passes)
{
//Following lambda is a test to see if hotspots marked "LINE X" and "LINE Y", below, are due to memory stalls.
//It turns out prefetch does eliminate hotspots X, Y, but adds overhead of its own, so this search algorithm is unavoidably
//memory-bound unless something along the lines of a heap-ordered array (i.e. array is laid out as a breadth-first n-ary tree) is
//used to convert random access to linear access without need for scatter-gather.
//#pragma omp parallel for //Adds too much overhead
[&]() //Sadly, won't vectorise due to function call
{
unsigned *__restrict__ tmp_ret_val = (unsigned*)__builtin_assume_aligned(&my_array[0], alignment); //Requirement for autovectorise.
for (int pos = 0; pos<limits[passes]; ++pos) //Loop over SIMD lanes to prefetch data they will use
{
int tmp = pos*chunk_length[passes]; //Get lower index for chunk interval
__builtin_prefetch(&my_array[0] + offset + tmp); //See if it removes hotspot from "LINE X", below
__builtin_prefetch(&my_array[0] + offset + tmp + chunk_length[passes] - 1); //See if it removes hotspot from "LINE Y", below
}
}();
//Fork: Assign each chunk to a SIMD lane
//N.B. Use lambda to force vectorisation of loop. Without it, loop is unrolled but SLP not vectorised. This does autovectorise under g++ 4.8
//N.B. Code has been broken down into painfully simple steps to help autovectoriser and pinpoint which operations are causing trouble
[&]()
{
unsigned *__restrict__ tmp_ret_val = (unsigned*)__builtin_assume_aligned(&my_array[0], alignment); //Requirement for autovectorise.
alignas(alignment) int chunk = chunk_length[passes];
for (alignas(alignment) int pos = 0; pos<limits[passes]; ++pos) //Loop over SIMD lanes
{
//Find matching chunk by adding 0 to offset for no-match and chunk address for a match
alignas(alignment) int tmp = pos*chunk; //Lower index of chunk range without offset
//int tmp=lower_index[pos]>>3; //g++ bug (http://gcc.gnu.org/bugzilla/show_bug.cgi?id=56787). Can't use 32-byte AVX.
alignas(alignment) int lower_ind = offset + tmp; //Lower index of chunk range
alignas(alignment) int upper_ind = lower_ind + chunk - 1; //Upper index of chunk range
unsigned lower_val = tmp_ret_val[lower_ind]; //LINE X - Hotspot (eliminated by above prefetch)
unsigned upper_val = tmp_ret_val[upper_ind]; //LINE Y - Hotspot (eliminated by above prefetch)
alignas(alignment) bool test_lower = lower_val <= val; //Lower
alignas(alignment) bool test_upper = val <= upper_val; //and upper limit check
alignas(alignment) bool test = test_lower && test_upper; //is search-val inside chunk for this SIMD lane?
tmp_offset += test*tmp; //Horrible construct to get it to autovec. It masks out SIMD lanes that don't contain search val.
//Following fails because it is "not suitable for gather" (whatever that means)
//offset += ((tmp_ret_val[offset+tmp] <= val) & (val <= tmp_ret_val[offset+tmp+chunk_length[passes]-1]))*tmp;
//Following fails because of "control flow" (Can't see why g++ doesn't autovec it, control flow can be replaced with masked op)
//if((tmp_ret_val[offset+tmp] <= val) && (val <= tmp_ret_val[offset+tmp+chunk_length[passes]-1])) tmp_offset = tmp;
}
}();
//Join: end of SIMD
//Update chunk start address index
offset = tmp_offset;
/*std::cout << "offset=" << offset << std::endl;
std::cout << "passes=" << passes;
std::cout << ", val=" << val;
std::cout << ", range=[" << array[offset] << "," << array[offset+1];
std::cout<< ", chunk length=" << chunk_length[passes] << std::endl;
*/
}
//Update final index of search-val
ret_val = &my_array[0] + offset;
return true;
};
//Lock-free lambda for each thread
//Operation:
//1. The array is split into t (num of threads) chunks
//2. Each thread examines its chunk
//3. If a match is found in a chunk, the thread changes the array to be that chunk.
//4. The process repeated from step 1.
//t threads continuously monitor the array and process their chunk of the array. Since the array pointer is
// atomic, when one thread sees a matching chunk and changes the array to be that chunk, this is picked up
// by all threads. No synchronisation is needed.
// arg chunk_pos - section of chunk to search (0 - [begin, begin+chunk_length/t], 1 - [begin+chunk_length/t, begin+2*chunk_length/t], ..)
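// Publication sketch (added, descriptive): each iteration of the lambda below does, in essence,
//   DoubleWord capture = chunk_address_and_length; // atomic snapshot of {chunk base, chunk length}
//   if the captured slice brackets val: chunk_address_and_length = {slice base, length/t}; // atomic publish, seen by all threads on their next loop
// and flips "done" instead once the chunk length has reached 1.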
std::atomic<unsigned> running_threads(0);
std::atomic<bool> go(false);
std::function<bool(unsigned)> lock_free = [&](unsigned chunk_pos)
{
//Increment running thread count
++running_threads;
//Keep all threads on hold until signalled to begin together (for timings).
while (!go);
//Keep searching until a thread notifies completion.
while (!done)
{
//capture chunk address and length
DoubleWord capture = chunk_address_and_length;
//Check if search-val between chunk.begin() and chunk.end()
unsigned *begin = capture.m_address + chunk_pos*capture.m_chunk_length;
unsigned *end = begin + capture.m_chunk_length - 1;
unsigned test1 = *begin, test2 = *end;
if (*begin <= val && val <= *end)
{
//Print out iterations (adds significant overhead)
static std::mutex printout_mutex;
if (printout)
{
std::unique_lock<std::mutex> lock(printout_mutex);
//Print out iterations (adds significant overhead)
std::cout << "Parallel find (pass " << passes << "): Closest match "
<< *begin << "<=" << val << "<=" << *end
<< ", chunk length=" << capture.m_chunk_length << std::endl;
}
//Update parent variables for printouts
ret_val = begin;
++passes; //keep count of passes (0 - size, 1 - size/t, 2 - size/t^2 ... with t - num threads)
std::function<void(void)> branch_true = [&]() //IF
{
//Update chunk length and address
capture.m_chunk_length = (capture.m_chunk_length >= num_threads ? (capture.m_chunk_length / num_threads) : 1); //divide chunk evenly
capture.m_address = begin; //point to this chunk
chunk_address_and_length = capture;
};
std::function<void(void)> branch_false = [&]() //ELSE
{
done = true; //notify parent and sister threads of completion
};
std::function<void(void)> if_else[2]{branch_false, branch_true}; //{else, if}
if_else[capture.m_chunk_length > 1](); //if-else
}
else
{
std::this_thread::sleep_for(std::chrono::nanoseconds(5000));
}
}
return true;
};
//Create thread pool for lock-based search
static ThreadPool thread_pool(num_threads);
//Notification of completion of lock-based search
std::condition_variable finished;
// lock-based lambda for each thread. It simply tests whether array[pos] <= search_val <= array[pos + chunk_length]
// and iff true, spawns t threads to narrow down the search, iteratively arriving at the insertion point.
std::function<bool(unsigned*, unsigned)> lock_based = [&](unsigned* tmp, unsigned chunk_length)
{
//Keep all threads on hold until signalled to begin together (for timings).
while (!go);
//Check if search-val between chunk.begin() and chunk.end()
if (*tmp <= val && val <= *(tmp + chunk_length - 1))
{
//Print out iterations (adds significant overhead)
if (printout)
{
std::cout << "Parallel find (pass " << passes << "): Closest match " << *tmp << "<=" <<
val
<< "<=" << (chunk_length > 1 ? *(tmp + chunk_length - 1) : *tmp == val ? val : *(tmp
+ 1))
<< ", chunk length=" << chunk_length << std::endl;
}
//Update parent variables for printouts
ret_val = tmp;
++passes; //keep count of passes (0 - size, 1 - size/t, 2 - size/t^2 ... with t - num threads)
//Spawn new tasks to process this chunk
//Following peculiar construct is to avoid branch misprediction by using array of fn ptrs to replace if-else
//need VTune to test out whether it saves us any mispredictions.
std::function<void(void)> branch_true = [&]() //IF
{
chunk_length = (chunk_length >= num_threads ? (chunk_length / num_threads) : 1); //divide chunk evenly
for (unsigned index = 0; index < num_threads; ++index)
{
thread_pool.push(lock_based, tmp + index*chunk_length, chunk_length);
}
};
std::function<void(void)> branch_false = [&]() //ELSE
{
finished.notify_one(); //chunk length is 1, so we are finished dividing-and-conquering
};
std::function<void(void)> if_else[2]{branch_false, branch_true}; //{else, if}
if_else[chunk_length>1](); //if-else
}
return true;
};
std::cout << std::endl << "Parallel vs Binary Search......" << std::endl << std::endl;
//Obtain position of element (to verify parallel search finds correct position).
auto pos = std::find(my_array.begin(), my_array.end(), val);
//Ordinary binary search for timing comparison
std::cout << std::endl << "==============================================================================" << std::endl;
std::cout << "Running binary search, please wait a few minutes ..." << std::endl;
std::cout << "==============================================================================" << std::endl;
unsigned factor = 10000; //number of times to run search
auto start_binary = std::chrono::high_resolution_clock::now(); //start timer
VTUNE(__itt_resume();)
for (unsigned i = 0; i < factor; ++i) std::binary_search(my_array.begin(), my_array.end(), val); //binary search
VTUNE(__itt_pause();)
auto end_binary = std::chrono::high_resolution_clock::now(); //stop timer
//print out results of binary search
using std::chrono::duration_cast;
using std::chrono::nanoseconds;
std::cout << "clock resolution is: " << static_cast<double>(std::chrono::high_resolution_clock::
period::num) << " ns" << std::endl;
std::cout << "std::find: val=" << val << ", element=" << (pos != my_array.end() ? *pos : -1) << ",
index=" << pos - my_array.begin()
<< ", found==" << std::boolalpha << (pos != my_array.end()) << ", time=" << duration_cast
<nanoseconds>(end_binary - start_binary).count() << "ns" << std::endl;
//Parallel searches
//SIMD search
if (is_simd)
{
std::cout << std::endl << "==============================================================================" << std::endl;
std::cout << "Running simd parallel search, please wait a few minutes ..." << std::endl;
std::cout << "==============================================================================" << std::endl;
//Kick off the search
auto start = std::chrono::high_resolution_clock::now(); //start timer
VTUNE(__itt_resume();)
VTUNE_TASK("Parallel Search", "simd_search()",
for (unsigned i = 0; i<factor; i++) simd_search();
)
//Wait for result and then get the insertion point and number of passes
VTUNE(__itt_pause();)
//get execution time
auto end = std::chrono::high_resolution_clock::now(); //stop timer
auto parallel_time = std::chrono::duration_cast<nanoseconds>(end - start).count();
//print results
double complexity = (passes + 1) / (std::log(size) / std::log(2));
auto binary_time = duration_cast<nanoseconds>(end_binary - start_binary).count();
std::cout << "Simd results:" << std::endl;
std::cout << "Size of array=" << size / 1000000 << " million elements" << std::endl;
std::cout << "Search repeated " << factor << " times" << std::endl;
std::cout << "number of threads=" << running_threads << std::endl;
std::cout << "val=" << val << ", [element_lower,element_upper]=[" << ret_val[0] << "," <<
ret_val[1] << "]";
std::cout << ", [index_lower,index_upper]=[" << ret_val - &my_array[0] << "," << ret_val + 1 - &
my_array[0] << "]" << std::endl;
std::cout << "O(n_parallel)/O(n_binary)=" << complexity << std::endl;
std::cout << "Time(parallel) / Time(binary) ) = " << parallel_time << "ns" << "/" << binary_time
<< "ns";
std::cout << " = ";
std::cout << parallel_time / binary_time << std::endl;
std::cout << "threading overhead =" << parallel_time - complexity*binary_time << "ns" << std::
endl;
}
//Lock-free multithreaded search
if (is_lock_free)
{
std::cout << std::endl << "==============================================================================" << std::endl;
std::cout << "Running lock-free parallel search, please wait a few minutes ..." << std::endl;
std::cout << "==============================================================================" << std::endl;
double parallel_time = 0;
for (unsigned i = 0; i<factor; ++i)
{
//reset passes counter, chunk struct, done flag
passes = 0;
chunk_address_and_length = DoubleWord{ &my_array[0], size / num_threads };
done = false;
//Kick off the search
//auto start = std::chrono::high_resolution_clock::now(); //start timer
std::vector<std::future<bool>> futures;
go = false;
futures.push_back(std::move(std::async(std::launch::deferred, lock_free, 0)));
for (unsigned chunk_pos = 1; chunk_pos < num_threads; ++chunk_pos)
{
futures.push_back(std::move(std::async(std::launch::async, lock_free, chunk_pos)));
}
//Wait for result and then get the insertion point and number of passes
auto start = std::chrono::high_resolution_clock::now(); //start timer
VTUNE(__itt_resume();)
VTUNE_TASK("Parallel Search", "lock_free()",
go = true; futures[0].get();
)
VTUNE(__itt_pause();)
//get execution time
auto end = std::chrono::high_resolution_clock::now(); //stop timer
parallel_time += std::chrono::duration_cast<nanoseconds>(end - start).count();
}
//print results
double complexity = (passes + 1) / (std::log(size) / std::log(2));
auto binary_time = duration_cast<nanoseconds>(end_binary - start_binary).count();
std::cout << "Lock free results:" << std::endl
<< "Size of array=" << size / 1000000 << " million elements" << std::endl
<< "Search repeated " << factor << " times" << std::endl
<< "number of threads=" << running_threads << std::endl
<< "val=" << val << ", [element_lower,element_upper]=[" << ret_val[0] << "," << ret_val[1] << "]"
<< ", [index_lower,index_upper]=[" << ret_val - &my_array[0] << "," << ret_val + 1 - &my_array[0] << "]" << std::endl
<< "O(n_parallel)/O(n_binary)=" << complexity << std::endl
<< "Time(parallel) / Time(binary) = " << parallel_time << "ns" << "/" << binary_time << "ns"
<< " = "
<< parallel_time / binary_time << std::endl
<< "threading overhead =" << parallel_time - complexity*binary_time << "ns" << std::endl;
}
if (is_lock_based)
{
std::cout << std::endl << "==============================================================================" << std::endl;
std::cout << "Running lock-based parallel search, please wait a few minutes ..." << std::endl;
std::cout << "==============================================================================" << std::endl;
double parallel_time = 0;
for (unsigned i = 0; i<factor; ++i)
{
//reset passes counter, chunk struct, done flag
passes = 0;
//Kick off the search
//auto start = std::chrono::high_resolution_clock::now(); //start timer
go = false;
auto f = thread_pool.push(lock_based, &my_array[0], size);
//Wait for result and then get the insertion point and number of passes
{
//wait for completion
std::mutex dummy;
std::unique_lock<std::mutex> lock(dummy);
auto start = std::chrono::high_resolution_clock::now(); //start timer
VTUNE(__itt_resume();)
VTUNE_TASK("Parallel Search", "lock_based()",
go = true;
finished.wait(lock);
)
VTUNE(__itt_pause();)
//get execution time
auto end = std::chrono::high_resolution_clock::now(); //stop timer
parallel_time += std::chrono::duration_cast<nanoseconds>(end - start).count();
thread_pool.cancel_tasks();
std::this_thread::sleep_for(std::chrono::milliseconds(10));
}
}
//kill thread pool
thread_pool.shutdown(false);
//print results
double complexity = (passes + 1) / (std::log(size) / std::log(2));
auto binary_time = duration_cast<nanoseconds>(end_binary - start_binary).count();
std::cout << "Lock based results:" << std::endl
<< "Size of array=" << size / 1000000 << " million elements" << std::endl
<< "Search repeated " << factor << " times" << std::endl
<< "number of threads=" << running_threads << std::endl
<< "val=" << val << ", [element_lower,element_upper]=[" << ret_val[0] << "," << ret_val[1] <
< "]"
<< ", [index_lower,index_upper]=[" << ret_val - &my_array[0] << "," << ret_val + 1 - &
my_array[0] << "]" << std::endl
<< "O(n_parallel)/O(n_binary)=" << complexity << std::endl
<< "Time(parallel) / Time(binary) ) = " << parallel_time << "ns" << "/" << binary_time <<
"ns"
<< " = "
14C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp
<< parallel_time / binary_time << std::endl
<< "threading overhead =" << parallel_time - complexity*binary_time << "ns" << std::endl;
}
}
catch (...)
{
std::cout << "ParallelSearch exception" << std::endl;
}
};
int main(void)
{
//SimpleTest simple_test;
VTUNE(__itt_pause();)
ParallelSearch parallel_search;
char c;
std::cout << "Press any key to exit" << std::endl;
std::cin >> c; //keep console alive
}
TrsLabs - AI Agents for All - Chatbots to Multi-Agents SystemsTrsLabs - AI Agents for All - Chatbots to Multi-Agents Systems
TrsLabs - AI Agents for All - Chatbots to Multi-Agents Systems
Trs Labs
 
Generative Artificial Intelligence (GenAI) in Business
Generative Artificial Intelligence (GenAI) in BusinessGenerative Artificial Intelligence (GenAI) in Business
Generative Artificial Intelligence (GenAI) in Business
Dr. Tathagat Varma
 
MINDCTI revenue release Quarter 1 2025 PR
MINDCTI revenue release Quarter 1 2025 PRMINDCTI revenue release Quarter 1 2025 PR
MINDCTI revenue release Quarter 1 2025 PR
MIND CTI
 
Unlocking Generative AI in your Web Apps
Unlocking Generative AI in your Web AppsUnlocking Generative AI in your Web Apps
Unlocking Generative AI in your Web Apps
Maximiliano Firtman
 
UiPath Automation Suite – Cas d'usage d'une NGO internationale basée à Genève
UiPath Automation Suite – Cas d'usage d'une NGO internationale basée à GenèveUiPath Automation Suite – Cas d'usage d'une NGO internationale basée à Genève
UiPath Automation Suite – Cas d'usage d'une NGO internationale basée à Genève
UiPathCommunity
 
Designing Low-Latency Systems with Rust and ScyllaDB: An Architectural Deep Dive
Designing Low-Latency Systems with Rust and ScyllaDB: An Architectural Deep DiveDesigning Low-Latency Systems with Rust and ScyllaDB: An Architectural Deep Dive
Designing Low-Latency Systems with Rust and ScyllaDB: An Architectural Deep Dive
ScyllaDB
 
Transcript: Canadian book publishing: Insights from the latest salary survey ...
Transcript: Canadian book publishing: Insights from the latest salary survey ...Transcript: Canadian book publishing: Insights from the latest salary survey ...
Transcript: Canadian book publishing: Insights from the latest salary survey ...
BookNet Canada
 
HCL Nomad Web – Best Practices und Verwaltung von Multiuser-Umgebungen
HCL Nomad Web – Best Practices und Verwaltung von Multiuser-UmgebungenHCL Nomad Web – Best Practices und Verwaltung von Multiuser-Umgebungen
HCL Nomad Web – Best Practices und Verwaltung von Multiuser-Umgebungen
panagenda
 