Samples: Add SMP Pi

This sample showcases efficient utilization of an SMP system by processing independent, resource-hungry workloads. With no cross-dependencies between workers and no usage of shared resources (during the heavy lifting itself), we can demonstrate almost linear scaling of efficiency, i.e. 2 cores do the same amount of calculations twice as fast as only 1 core, and 4 cores complete the same calculations twice as fast as 2 cores. Signed-off-by: Alexey Brodkin <abrodkin@synopsys.com>
2019-09-02 16:50:06 +03:00 · 2019-09-02 16:50:06 +03:00 · 7c3498dc43
parent 265b6ff59e
commit 7c3498dc43
7 changed files with 200 additions and 0 deletions
--- a/samples/index.rst
+++ b/samples/index.rst
@ -25,6 +25,7 @@ Samples and Demos
   posix/*
   gui/*
   video/*
   smp/*
 .. comment
   To add a new sample document, please use the template available under
--- a/samples/smp/index.rst
+++ b/samples/smp/index.rst
@ -0,0 +1,10 @@
 .. _smp-samples:
 Various SMP Samples
 ###################
 .. toctree::
   :maxdepth: 1
   :glob:
   **/*
--- a/samples/smp/pi/CMakeLists.txt
+++ b/samples/smp/pi/CMakeLists.txt
@ -0,0 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # Build script for the SMP Pi sample application.
 cmake_minimum_required(VERSION 3.13.1)
 # Pull in the application boilerplate from the tree that the ZEPHYR_BASE
 # environment variable points to.
 include($ENV{ZEPHYR_BASE}/cmake/app/boilerplate.cmake NO_POLICY_SCOPE)
 project(smp_pi)
 # The whole sample lives in a single source file.
 target_sources(app PRIVATE src/main.c)
--- a/samples/smp/pi/README.rst
+++ b/samples/smp/pi/README.rst
@ -0,0 +1,46 @@
 .. _smp_pi:
 SMP Pi
 ###########
 Overview
 ********
 This sample application calculates Pi independently in many threads, and
 demonstrates the benefit of multiple execution units (CPU cores)
 when compute-intensive tasks can be run in parallel, with
 no cross-dependencies or shared resources.
 By changing the value of CONFIG_MP_NUM_CPUS on SMP systems, you
 can see that using more cores takes almost linearly less time
 to complete the computational task.
 You can also edit the sample source code to change the
 number of digits calculated (``DIGITS_NUM``), and the
 number of threads to use (``THREADS_NUM``).
 Building and Running
 ********************
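 This project outputs the Pi value calculated by each thread and, at the end,
 the total time required for all the calculations to be done. It can be built
 and executed on an SMP-capable target such as the Synopsys ARC HSDK board;
 the command below builds and runs it for the ``qemu_x86_64`` board: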
 .. zephyr-app-commands::
   :zephyr-app: samples/smp/pi
   :host-os: unix
   :board: qemu_x86_64
   :goals: run
   :compact:
 Sample Output
 =============
 .. code-block:: console
    Calculate first 240 digits of Pi independently by 16 threads.
    Pi value calculated by thread #0: 3141592653589793238462643383279502884197...
    Pi value calculated by thread #1: 3141592653589793238462643383279502884197...
    ...
    Pi value calculated by thread #14: 314159265358979323846264338327950288419...
    Pi value calculated by thread #15: 314159265358979323846264338327950288419...
    All 16 threads executed by 4 cores in 28 msec
--- a/samples/smp/pi/prj.conf
+++ b/samples/smp/pi/prj.conf
@ -0,0 +1,5 @@
 # Allow worker threads to capture all resources
 CONFIG_MAIN_THREAD_PRIORITY=11
 # Enable SMP
 CONFIG_SMP=y
--- a/samples/smp/pi/sample.yaml
+++ b/samples/smp/pi/sample.yaml
@ -0,0 +1,18 @@
 sample:
  description: Calculation of Pi independently in
    a number of threads
  name: SMP Pi
 common:
    tags: introduction
    harness: console
    harness_config:
      type: multi_line
      ordered: yes
      regex:
        - "Calculate first [0-9]+ digits of Pi independently by [0-9]+ threads.(.*)"
        - "Pi value calculated by thread #[0-9]+: [0-9]+(.*)"
        - "All [0-9]+ threads executed by [0-9]+ cores in [0-9]+ msec(.*)"
 tests:
  sample.smp_pi:
    tags: introduction
    platform_whitelist: nsim_hs_smp qemu_x86_64
--- a/samples/smp/pi/src/main.c
+++ b/samples/smp/pi/src/main.c
@ -0,0 +1,112 @@
 /*
 * Copyright (c) 2019 Synopsys, Inc.
 *
 * SPDX-License-Identifier: Apache-2.0
 */
 #include <zephyr.h>
 #include <stdio.h>
 /* Number of worker threads to create and run */
 #define THREADS_NUM	16
 /*
  * Number of digits of Pi to calculate. Must be a multiple of 4,
  * because the algorithm used emits 4 digits on every iteration.
  */
 #define DIGITS_NUM	240
 /*
  * Number of working-array elements processed by each worker: the outer
  * loop of the algorithm consumes 14 elements per 4-digit group.
  */
 #define LENGTH		((DIGITS_NUM / 4) * 14)
 /*
  * Per-worker stack size: room for LENGTH ints of working array plus a
  * 512-byte margin for everything else. Note that the worker actually
  * declares LENGTH + 1 ints, so the extra element is taken from the margin.
  */
 #define STACK_SIZE	(LENGTH * sizeof(int) + 512)
 /* Number of cores shown in the final report: 1 unless CONFIG_SMP is set */
 #ifdef CONFIG_SMP
 #define CORES_NUM	CONFIG_MP_NUM_CPUS
 #else
 #define CORES_NUM	1
 #endif
 /* One stack and one thread object per worker, handed to k_thread_create() */
 static K_THREAD_STACK_ARRAY_DEFINE(tstack, THREADS_NUM, STACK_SIZE);
 static struct k_thread tthread[THREADS_NUM];
 /* One result string per worker: DIGITS_NUM digits plus the terminating NUL */
 static char buffer[THREADS_NUM][DIGITS_NUM + 1];
 /*
  * Number of workers that have not finished yet. Each worker decrements it
  * when done; main() polls it until it reaches zero.
  */
 static atomic_t counter = THREADS_NUM;
 /*
  * Worker thread entry point: calculate the first DIGITS_NUM digits of Pi
  * and store them as a decimal string.
  *
  * arg1: pointer to the atomic_t count of workers still running; it is
  *       decremented once, after the last digit has been written.
  * arg2: output buffer for the digits; must have room for DIGITS_NUM
  *       characters plus the terminating NUL.
  * arg3: unused.
  */
 void test_thread(void *arg1, void *arg2, void *arg3)
 {
 	/* Local names deliberately differ from the file-scope objects */
 	atomic_t *pending = (atomic_t *)arg1;
 	char *out = (char *)arg2;

 	ARG_UNUSED(arg3);

 	/*
 	 * Adapted and improved (for random number of digits) version of Pi
 	 * calculation program initially proposed by Dik T. Winter as:
 	 * -------------------------------->8--------------------------------
 	 * int a=10000,b,c=2800,d,e,f[2801],g;main(){for(;b-c;)f[b++]=a/5;
 	 * for(;d=0,g=c*2;c-=14,printf("%.4d",e+d/a),e=d%a)for(b=c;d+=f[b]*a,
 	 * f[b]=d%--g,d/=g--,--b;d*=b);}
 	 * -------------------------------->8--------------------------------
 	 */
 	#define NEW_BASE	10000
 	#define ARRAY_INIT	2000

 	/* Every outer iteration produces one group of this many digits */
 	enum { DIGITS_PER_STEP = 4 };

 	/*
 	 * Elements 0..LENGTH-1 are set to ARRAY_INIT below; the last element
 	 * (index LENGTH) intentionally stays zero, as in the original program.
 	 */
 	int array[LENGTH + 1] = {0};
 	int carry = 0;
 	int i, j;

 	for (i = 0; i < LENGTH; i++) {
 		array[i] = ARRAY_INIT;
 	}

 	for (i = LENGTH; i > 0; i -= 14) {
 		int sum = 0, value;

 		for (j = i; j > 0; --j) {
 			sum = sum * j + NEW_BASE * array[j];
 			array[j] = sum % (j * 2 - 1);
 			sum /= j * 2 - 1;
 		}

 		value = carry + sum / NEW_BASE;
 		carry = sum % NEW_BASE;

 		/*
 		 * Convert the 4-digit int to a string. The write is bounded to
 		 * one digit group plus NUL so that the final group can never
 		 * run past the end of the caller's DIGITS_NUM + 1 byte buffer.
 		 */
 		snprintf(out, DIGITS_PER_STEP + 1, "%.4d", value);
 		out += DIGITS_PER_STEP;
 	}

 	/* Tell main() that this worker is done */
 	atomic_dec(pending);
 }
 /*
  * Sample entry point: start THREADS_NUM workers, wait until all of them
  * have finished, then print each worker's result and the elapsed time.
  */
 void main(void)
 {
 	u32_t t_begin;
 	u32_t t_end;
 	u32_t elapsed_cycles;
 	u32_t elapsed_ns;
 	u32_t elapsed_ms;
 	int idx;

 	printk("Calculate first %d digits of Pi independently by %d threads.\n",
 	       DIGITS_NUM, THREADS_NUM);

 	/* Time stamp taken right before the first worker is created */
 	t_begin = k_cycle_get_32();

 	/* Each worker gets its own stack, thread object and result buffer */
 	for (idx = 0; idx < THREADS_NUM; idx++) {
 		k_thread_create(&tthread[idx], tstack[idx], STACK_SIZE,
 				(k_thread_entry_t)test_thread,
 				(void *)&counter, (void *)buffer[idx], NULL,
 				K_PRIO_COOP(10), 0, K_NO_WAIT);
 	}

 	/* Poll until every worker has decremented the shared counter */
 	while (counter != 0) {
 		k_sleep(1);
 	}

 	/* Time stamp taken once all workers are done */
 	t_end = k_cycle_get_32();

 	/*
 	 * The nanosecond value is held in a u32_t, so it can only represent
 	 * runs shorter than about 4.29 seconds.
 	 */
 	elapsed_cycles = t_end - t_begin;
 	elapsed_ns = SYS_CLOCK_HW_CYCLES_TO_NS(elapsed_cycles);
 	elapsed_ms = elapsed_ns / 1000 / 1000;

 	for (idx = 0; idx < THREADS_NUM; idx++) {
 		printk("Pi value calculated by thread #%d: %s\n", idx,
 		       buffer[idx]);
 	}

 	printk("All %d threads executed by %d cores in %d msec\n", THREADS_NUM,
 	       CORES_NUM, elapsed_ms);
 }