From 7c3498dc436badf7d1d57769b1de55a5496f77e7 Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Mon, 2 Sep 2019 16:50:06 +0300 Subject: [PATCH] Samples: Add SMP Pi This sample showcases efficient utilization of an SMP system with processing of independent resource-hungry workloads. With no cross-dependencies between workers and no usage of shared resources (during heavy-lifting itself) we may demonstrate almost linear scaling of efficiency. I.e. 2 cores do the same amount of calculations twice as fast as only 1 core. 4 cores complete the same calculations twice as fast as 2 cores. Signed-off-by: Alexey Brodkin --- samples/index.rst | 1 + samples/smp/index.rst | 10 +++ samples/smp/pi/CMakeLists.txt | 8 +++ samples/smp/pi/README.rst | 46 ++++++++++++++ samples/smp/pi/prj.conf | 5 ++ samples/smp/pi/sample.yaml | 18 ++++++ samples/smp/pi/src/main.c | 112 ++++++++++++++++++++++++++++++++++ 7 files changed, 200 insertions(+) create mode 100644 samples/smp/index.rst create mode 100644 samples/smp/pi/CMakeLists.txt create mode 100644 samples/smp/pi/README.rst create mode 100644 samples/smp/pi/prj.conf create mode 100644 samples/smp/pi/sample.yaml create mode 100644 samples/smp/pi/src/main.c diff --git a/samples/index.rst b/samples/index.rst index 7492cf8b6b1..dde28aaca84 100644 --- a/samples/index.rst +++ b/samples/index.rst @@ -25,6 +25,7 @@ Samples and Demos posix/* gui/* video/* + smp/* .. comment To add a new sample document, please use the template available under diff --git a/samples/smp/index.rst b/samples/smp/index.rst new file mode 100644 index 00000000000..64c4660aaed --- /dev/null +++ b/samples/smp/index.rst @@ -0,0 +1,10 @@ +.. _smp-samples: + +Various SMP Samples +################### + +..
toctree:: + :maxdepth: 1 + :glob: + + **/* diff --git a/samples/smp/pi/CMakeLists.txt b/samples/smp/pi/CMakeLists.txt new file mode 100644 index 00000000000..c8c1cf19f55 --- /dev/null +++ b/samples/smp/pi/CMakeLists.txt @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 + +cmake_minimum_required(VERSION 3.13.1) + +include($ENV{ZEPHYR_BASE}/cmake/app/boilerplate.cmake NO_POLICY_SCOPE) +project(smp_pi) + +target_sources(app PRIVATE src/main.c) diff --git a/samples/smp/pi/README.rst b/samples/smp/pi/README.rst new file mode 100644 index 00000000000..432e9d9292a --- /dev/null +++ b/samples/smp/pi/README.rst @@ -0,0 +1,46 @@ +.. _smp_pi: + +SMP Pi +########### + +Overview +******** +This sample application calculates Pi independently in many threads, and +demonstrates the benefit of multiple execution units (CPU cores) +when compute-intensive tasks can be run in parallel, with +no cross-dependencies or shared resources. + +By changing the value of CONFIG_MP_NUM_CPUS on SMP systems, you +can see that using more cores takes almost linearly less time +to complete the computational task. + +You can also edit the sample source code to change the +number of digits calculated (``DIGITS_NUM``), and the +number of threads to use (``THREADS_NUM``). + +Building and Running +******************** + +This project outputs the Pi value calculated by each thread and, at the end, the total time +required for all the calculations to complete. It can be built and executed +in QEMU (``qemu_x86_64``) as follows: + +.. zephyr-app-commands:: + :zephyr-app: samples/smp/pi + :host-os: unix + :board: qemu_x86_64 + :goals: run + :compact: + +Sample Output +============= + +.. code-block:: console + + Calculate first 240 digits of Pi independently by 16 threads. + Pi value calculated by thread #0: 3141592653589793238462643383279502884197... + Pi value calculated by thread #1: 3141592653589793238462643383279502884197... + ... + Pi value calculated by thread #14: 314159265358979323846264338327950288419...
+ Pi value calculated by thread #15: 314159265358979323846264338327950288419... + All 16 threads executed by 4 cores in 28 msec diff --git a/samples/smp/pi/prj.conf b/samples/smp/pi/prj.conf new file mode 100644 index 00000000000..7829564f9ad --- /dev/null +++ b/samples/smp/pi/prj.conf @@ -0,0 +1,5 @@ +# Allow worker threads to capture all resources +CONFIG_MAIN_THREAD_PRIORITY=11 + +# Enable SMP +CONFIG_SMP=y diff --git a/samples/smp/pi/sample.yaml b/samples/smp/pi/sample.yaml new file mode 100644 index 00000000000..63b44ce50ff --- /dev/null +++ b/samples/smp/pi/sample.yaml @@ -0,0 +1,18 @@ +sample: + description: Calculation of Pi independently in + a number of threads + name: SMP Pi +common: + tags: introduction + harness: console + harness_config: + type: multi_line + ordered: yes + regex: + - "Calculate first [0-9]+ digits of Pi independently by [0-9]+ threads.(.*)" + - "Pi value calculated by thread #[0-9]+: [0-9]+(.*)" + - "All [0-9]+ threads executed by [0-9]+ cores in [0-9]+ msec(.*)" +tests: + sample.smp_pi: + tags: introduction + platform_whitelist: nsim_hs_smp qemu_x86_64 diff --git a/samples/smp/pi/src/main.c b/samples/smp/pi/src/main.c new file mode 100644 index 00000000000..f100ae7818a --- /dev/null +++ b/samples/smp/pi/src/main.c @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2019 Synopsys, Inc. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include <zephyr.h> +#include <stdio.h> + +/* Number of execution threads to create and run */ +#define THREADS_NUM 16 + +/* + * Number of digits of Pi to calculate, must be a multiple of 4, + * as the algorithm used emits 4 digits on every iteration.
+ */
+#define DIGITS_NUM	240
+
+/* Working array length: 14 elements for every 4 digits produced */
+#define LENGTH		((DIGITS_NUM / 4) * 14)
+
+/* Worker stack size: room for LENGTH ints plus 512 bytes */
+#define STACK_SIZE	(LENGTH * sizeof(int) + 512)
+
+/* Core count reported in the final summary line */
+#ifdef CONFIG_SMP
+#define CORES_NUM	CONFIG_MP_NUM_CPUS
+#else
+#define CORES_NUM	1
+#endif
+
+/* Numeric base of the calculation: every step yields one 4-digit group */
+#define NEW_BASE	10000
+
+/* Initial value of the working array elements */
+#define ARRAY_INIT	2000
+
+static K_THREAD_STACK_ARRAY_DEFINE(tstack, THREADS_NUM, STACK_SIZE);
+static struct k_thread tthread[THREADS_NUM];
+
+/* One NUL-terminated string of digits per worker thread */
+static char buffer[THREADS_NUM][DIGITS_NUM + 1];
+
+/* Number of worker threads that have not finished yet */
+static atomic_t counter = THREADS_NUM;
+
+/*
+ * Worker thread entry point: calculates digits of Pi into a caller-supplied
+ * buffer and then signals completion.
+ *
+ * arg1: pointer to the atomic_t that is decremented once when the
+ *       calculation is complete
+ * arg2: output buffer, at least DIGITS_NUM + 1 chars long; receives the
+ *       digits as a NUL-terminated string
+ * arg3: unused
+ */
+void test_thread(void *arg1, void *arg2, void *arg3)
+{
+	atomic_t *remaining = (atomic_t *)arg1;
+	char *out = (char *)arg2;
+	int array[LENGTH + 1] = {0};
+	int carry = 0;
+	int i, j;
+
+	ARG_UNUSED(arg3);
+
+	/*
+	 * Adapted and improved (for an arbitrary number of digits) version of
+	 * Pi calculation program initially proposed by Dik T. Winter as:
+	 * -------------------------------->8--------------------------------
+	 * int a=10000,b,c=2800,d,e,f[2801],g;main(){for(;b-c;)f[b++]=a/5;
+	 * for(;d=0,g=c*2;c-=14,printf("%.4d",e+d/a),e=d%a)for(b=c;d+=f[b]*a,
+	 * f[b]=d%--g,d/=g--,--b;d*=b);}
+	 * -------------------------------->8--------------------------------
+	 */
+
+	/*
+	 * Only elements [0, LENGTH) are seeded; array[LENGTH] stays zero,
+	 * just like f[2800] in the program quoted above.
+	 */
+	for (i = 0; i < LENGTH; i++) {
+		array[i] = ARRAY_INIT;
+	}
+
+	/* Every pass consumes 14 array elements and emits one 4-digit group */
+	for (i = LENGTH; i > 0; i -= 14) {
+		int sum = 0;
+		int value;
+
+		for (j = i; j > 0; --j) {
+			sum = sum * j + NEW_BASE * array[j];
+			array[j] = sum % (j * 2 - 1);
+			sum /= j * 2 - 1;
+		}
+
+		value = carry + sum / NEW_BASE;
+		carry = sum % NEW_BASE;
+
+		/*
+		 * Convert 4-digit int to string. The size argument limits the
+		 * write to 4 characters plus the terminating NUL, so even an
+		 * unexpectedly large value cannot run past the output buffer.
+		 */
+		snprintf(out, 4 + 1, "%.4d", value);
+		out += 4;
+	}
+
+	/* Tell main() that this worker is done */
+	atomic_dec(remaining);
+}
+
+/*
+ * Starts THREADS_NUM workers, waits until all of them have finished, then
+ * prints the value calculated by each worker and the elapsed time.
+ */
+void main(void)
+{
+	u32_t start_time, stop_time, cycles_spent, nanoseconds_spent;
+	int i;
+
+	printk("Calculate first %d digits of Pi independently by %d threads.\n",
+	       DIGITS_NUM, THREADS_NUM);
+
+	/* Capture initial time stamp */
+	start_time = k_cycle_get_32();
+
+	for (i = 0; i < THREADS_NUM; i++) {
+		k_thread_create(&tthread[i], tstack[i], STACK_SIZE,
+				(k_thread_entry_t)test_thread,
+				(void *)&counter, (void *)buffer[i], NULL,
+				K_PRIO_COOP(10), 0, K_NO_WAIT);
+	}
+
+	/*
+	 * Wait for all workers to finish their calculations. The counter is
+	 * decremented by the workers with atomic_dec(), so it is read back
+	 * through the atomic API as well rather than as a plain variable.
+	 */
+	while (atomic_get(&counter) != 0) {
+		k_sleep(1);
+	}
+
+	/* Capture final time stamp */
+	stop_time = k_cycle_get_32();
+
+	/*
+	 * NOTE(review): nanoseconds_spent is a u32_t, so the reported time
+	 * presumably wraps if a run takes longer than about 4.29 seconds -
+	 * confirm this is acceptable for the supported platforms.
+	 */
+	cycles_spent = stop_time - start_time;
+	nanoseconds_spent = SYS_CLOCK_HW_CYCLES_TO_NS(cycles_spent);
+
+	for (i = 0; i < THREADS_NUM; i++) {
+		printk("Pi value calculated by thread #%d: %s\n", i, buffer[i]);
+	}
+
+	/* The elapsed time is unsigned, hence %u rather than %d */
+	printk("All %d threads executed by %d cores in %u msec\n", THREADS_NUM,
+	       CORES_NUM, nanoseconds_spent / 1000 / 1000);
+}