538 lines
21 KiB
C
538 lines
21 KiB
C
/**
|
|
* Copyright 2018 Au-Zone Technologies
|
|
* All rights reserved.
|
|
*
|
|
* Software that is described herein is for illustrative purposes only which
|
|
* provides customers with programming information regarding the DeepViewRT
|
|
* library. This software is supplied "AS IS" without any warranties of any
|
|
* kind, and Au-Zone Technologies and its licensor disclaim any and all
|
|
* warranties, express or implied, including all implied warranties of
|
|
* merchantability, fitness for a particular purpose and non-infringement of
|
|
* intellectual property rights. Au-Zone Technologies assumes no responsibility
|
|
* or liability for the use of the software, conveys no license or rights under
|
|
* any patent, copyright, mask work right, or any other intellectual property
|
|
* rights in or to any products. Au-Zone Technologies reserves the right to make
|
|
* changes in the software without notification. Au-Zone Technologies also makes
|
|
* no representation or warranty that such application will be suitable for the
|
|
* specified use without further testing or modification.
|
|
*
|
|
* Permission to use, copy, modify, and distribute this software and its
|
|
* documentation requires authorization from Au-Zone Technologies which is
|
|
* available free of charge by visiting https://embeddedml.com/deepview-samples
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include <math.h>
|
|
|
|
#include "fsl_device_registers.h"
|
|
#include "fsl_debug_console.h"
|
|
#include "clock_config.h"
|
|
#include "board_init.h"
|
|
#include "board.h"
|
|
|
|
#include <cr_section_macros.h>
|
|
|
|
#include "deepview_rt.h"
|
|
#include "deepview_ops.h"
|
|
|
|
|
|
/*******************************************************************************
|
|
* Definitions
|
|
******************************************************************************/
|
|
|
|
/**
 * Per-layer timing report switch.
 *
 * When set to 1, main() prints the timing of every layer that reports a
 * non-zero time (constant layers are skipped) once the inference loop is done.
 */
#define LAYER_TIMING (0)
|
|
|
|
/**
 * Model placement switch.
 *
 * When set to 1 the model is copied from FLASH into an SDRAM buffer before
 * it is loaded into the context; when 0 it is loaded in place.
 */
#define LOAD_MODEL_TO_SDRAM (0)
|
|
|
|
/**
 * Size in bytes of the DeepViewRT cache.
 *
 * The cache is used by DeepViewRT to optimize certain internal loops. It is
 * optional, but if used it should be placed in the fastest available memory;
 * in this case we use the SRAM_DTC.
 *
 * The replacement list is parenthesized so the macro expands correctly inside
 * any expression (for example `x / CACHE_SIZE`).
 */
#if defined(CPU_MIMXRT1176DVMAA_cm7)
#define CACHE_SIZE (256 * 1024)
#else
/* A 128K cache is workable for the default model. */
#define CACHE_SIZE (128 * 1024)
#endif
|
|
|
|
/**
 * Selects which SSD model flavour main() expects:
 *   0 - tf1.x mobilenetv1/v2-ssd
 *   1 - eIQ ssd
 */
#define SSD_MODEL_VERSION (0)
|
|
|
|
/**
 * The mempool holds the intermediate buffers for evaluating the model. This
 * buffer can be multiple megabytes in size and therefore should be stored in
 * the SDRAM. You may adjust this size if your particular model requires more
 * or less memory as reported by the conversion tool. If insufficient memory
 * is provided an error will be reported by nn_context_model_load().
 */
/**
 * The maximum runtime memory size, in bytes. Adjust it according to the
 * model's runtime size and the board's SDRAM size.
 *
 * The replacement list is parenthesized so the macro expands correctly inside
 * any expression (for example `x / MEMPOOL_SIZE`).
 */
#define MEMPOOL_SIZE (5 * 1024 * 1024)
|
|
|
|
|
|
/*******************************************************************************
|
|
* Prototypes
|
|
******************************************************************************/
|
|
NN_API NNError
|
|
nn_ssd_decode_nms_standard_bbx(NNTensor* score_tensor,
|
|
NNTensor* trans,
|
|
NNTensor* anchors,
|
|
NNTensor* cache,
|
|
float score_threshold,
|
|
float iou_threshold,
|
|
int32_t max_output_size,
|
|
NNTensor* bbx_out_tensor,
|
|
NNTensor* bbx_out_dim_tensor);
|
|
|
|
|
|
NN_API NNError
|
|
nn_ssd_decode_nms_variance_bbx(NNTensor* prediction,
|
|
NNTensor* anchors,
|
|
NNTensor* cache,
|
|
float score_threshold,
|
|
float iou_threshold,
|
|
int32_t max_output_size_per_class,
|
|
NNTensor* bbx_out_tensor,
|
|
NNTensor* bbx_out_dim_tensor);
|
|
|
|
/*******************************************************************************
|
|
* Variables
|
|
******************************************************************************/
|
|
|
|
/*
 * Embedded binary blobs: the DeepViewRT model and the sample image.
 *
 * IAR builds see each blob as an array with only a start symbol, so main()
 * falls back to fixed sizes for them. Other toolchains get start/end marker
 * symbols from model.S and main() computes the sizes from their addresses.
 */
#if defined(__ICCARM__) /* for iar toolchain */
extern const unsigned char model_rtm_start[];
extern const unsigned char sample_img_start[];
#else
/* DeepViewRT model markers from model.S */
extern const unsigned char model_rtm_start;
extern const unsigned char model_rtm_end;

/* Sample image markers from model.S */
extern const unsigned char sample_img_start;
extern const unsigned char sample_img_end;
#endif
|
|
|
|
/**
 * The DeepViewRT cache buffer, placed in DTCM for maximum performance.
 *
 * The buffer is addressed directly at the fixed DTCM location used on both
 * the 1170 and 106x parts rather than being allocated by the linker. The
 * linker-placed alternative would be:
 *   __BSS(SRAM_DTC_cm7) uint8_t cache[CACHE_SIZE] __attribute__((aligned(32)));
 *
 * NOTE(review): nothing in this file reserves the CACHE_SIZE bytes at this
 * address - confirm the linker script keeps other data out of that range.
 */
#define CACHE_DTCM_BASE 0x20000000
uint8_t *cache = (uint8_t *) CACHE_DTCM_BASE;
|
|
|
|
/**
|
|
* The DeepViewRT Memory Pool buffer holds intermediate buffers and is
|
|
* stored in SDRAM for maximum storage space.
|
|
*/
|
|
__BSS(BOARD_SDRAM) uint8_t mempool[MEMPOOL_SIZE] __attribute__((aligned(32)));
|
|
|
|
#if LOAD_MODEL_TO_SDRAM
/**
 * SDRAM staging buffer that main() copies the model into before loading it.
 *
 * MEMBLOB_SIZE needs to be at least as large as the RTM model file. The
 * replacement list is parenthesized so the macro expands correctly inside any
 * expression.
 */
#define MEMBLOB_SIZE (10 * 1024 * 1024)
__BSS(BOARD_SDRAM) uint8_t memblob[MEMBLOB_SIZE] __attribute__((aligned(32)));
#endif
|
|
|
|
/* Engine handle passed to nn_tensor_init(); this sample leaves it NULL. */
static NNEngine *engine = NULL;
|
|
|
|
/**
 * Tick counter advanced by SysTick_Handler() on every SysTick interrupt.
 * main() configures SysTick to fire every millisecond, so one tick is 1 ms.
 */
volatile int32_t g_systickCounter = 0;

/**
 * SysTick interrupt handler: advances g_systickCounter by one tick.
 *
 * The addition is done in unsigned arithmetic so that the counter wraps from
 * INT32_MAX to INT32_MIN instead of overflowing a signed integer, which is
 * undefined behaviour in C. The conversion back to int32_t is
 * implementation-defined; it wraps modulo 2^32 on GCC and Clang.
 */
void SysTick_Handler(void)
{
    g_systickCounter = (int32_t) ((uint32_t) g_systickCounter + 1U);
}

/**
 * This symbol is required by DeepViewRT for internal time keeping and MUST
 * return a 64-bit signed integer of continuous nanoseconds. The epoch is
 * not important but the counter should never reset during model evaluation.
 *
 * The tick counter is reinterpreted as an unsigned 32-bit value, so the
 * returned time keeps increasing while the counter crosses INT32_MAX and only
 * wraps back to zero after 2^32 milliseconds. This sample has no further
 * overflow protection; production software should have more intelligent
 * tracking.
 *
 * @return nanoseconds derived from the millisecond tick counter.
 */
int64_t os_clock_now(void)
{
    /* One tick is one millisecond, i.e. 1,000,000 ns. */
    const uint32_t ticks_ms = (uint32_t) g_systickCounter;

    return (int64_t) ticks_ms * 1000000;
}
|
|
|
|
/*!
|
|
* @brief Main function
|
|
*/
|
|
int main(void)
|
|
{
|
|
/* Init board hardware. */
|
|
BOARD_Init();
|
|
|
|
/**
|
|
* Initialize the SysTick to fire every milliseconds. If this is adjusted the os_clock_now()
|
|
* must also be adjusted accordingly to continue reporting nanoseconds.
|
|
*/
|
|
SysTick_Config(SystemCoreClock / 1000U);
|
|
|
|
PRINTF("==========================================================================\r\n");
|
|
PRINTF(" DeepviewRT Image Detection Demo\r\n");
|
|
PRINTF("===========================================================================\r\n");
|
|
//PRINTF("CPU: %d Hz\r\n", CLOCK_GetFreq(kCLOCK_CpuClk));
|
|
//printf("AHB: %d Hz\r\n", CLOCK_GetFreq(kCLOCK_AhbClk));
|
|
//PRINTF("SEMC: %d Hz\r\n", CLOCK_GetFreq(kCLOCK_Semc));
|
|
|
|
/**
|
|
* The model and model_size will be setup at startup based on the model_rtm_start
|
|
* and model_rtm_end variables from the model.S file.
|
|
*/
|
|
#if defined ( __ICCARM__ ) /* for iar toolchain */
|
|
const uint8_t *model = model_rtm_start;
|
|
int model_size = 8 * 1024 * 1024;
|
|
#else
|
|
const uint8_t *model_end = &model_rtm_end;
|
|
const uint8_t *model = &model_rtm_start;
|
|
int model_size = model_end - model;
|
|
#endif
|
|
if (model_size < 1) {
|
|
PRINTF("[ERROR] invalid model_size (%d) verify model.S implementation.\r\n", model_size);
|
|
return EXIT_FAILURE;
|
|
}
|
|
|
|
/**
|
|
* Just like model above, but from sample_img_start/sample_img_end.
|
|
*/
|
|
#if defined ( __ICCARM__ ) /* for iar toolchain */
|
|
const uint8_t *sample_image = sample_img_start;
|
|
int sample_image_size = 200 * 1024;
|
|
#else
|
|
const uint8_t *image_end = &sample_img_end;
|
|
const uint8_t *sample_image = &sample_img_start;
|
|
int sample_image_size = image_end - sample_image;
|
|
#endif
|
|
if (sample_image_size < 1) {
|
|
PRINTF("[ERROR] invalid sample_image_size (%d) verify model.S implementation.\r\n", sample_image_size);
|
|
return EXIT_FAILURE;
|
|
}
|
|
/**
|
|
* The NNContext structure holds runtime model data including the memory pool
|
|
* and optional cache. The first parameter is for the engine which is not used
|
|
* on MCU devices, but the same API is provided across all platforms.
|
|
*
|
|
* If MEMPOOL_SIZE is 0 then each layer's tensor will be allocated on the heap
|
|
* using malloc instead of using the optimized memory map. If MEMPOOL_SIZE is
|
|
* greater than 0 but mempool is NULL then the pool will be allocated from the
|
|
* heap and the optimized memory map WILL be used.
|
|
*
|
|
* If CACHE_SIZE is 0 then no cache will be used, convolutions especially will
|
|
* take significantly longer. If CACHE_SIZE is greater than 0 but cache is
|
|
* NULL then it will be allocated on the heap. This will provide better performance
|
|
* but will depend on the performance of heap memory, if this is SDRAM the model
|
|
* will take approximately 10x longer than if the cache is located in SRAM_DTC.
|
|
*
|
|
* If the heap is too small for the configuration context will be NULL.
|
|
*/
|
|
NNContext *context = nn_context_init(NULL,
|
|
MEMPOOL_SIZE, mempool,
|
|
CACHE_SIZE, cache);
|
|
if (!context) {
|
|
PRINTF("[ERROR] insufficient memory to create context\r\n");
|
|
return EXIT_FAILURE;
|
|
}
|
|
|
|
#if LOAD_MODEL_TO_SDRAM
|
|
if(model_size < MEMBLOB_SIZE){
|
|
memcpy(memblob,model,model_size);
|
|
model = (const uint8_t*)memblob;
|
|
PRINTF("Model loaded to SDRAM...\r\n");
|
|
} else {
|
|
PRINTF("Model too large (%d) for SDRAM buffer (%d)\r\n", model_size, MEMBLOB_SIZE);
|
|
}
|
|
#endif
|
|
|
|
/**
|
|
* Loads the model into the context. If the model is invalid because of corruption
|
|
* or alignment an error will be returned. If the provided MEMPOOL_SIZE is insufficient
|
|
* an error will also be returned. Many of these internal errors will also be logged
|
|
* through stderr.
|
|
*/
|
|
NNError err = nn_context_model_load(context, (size_t) model_size, model);
|
|
if (err) {
|
|
PRINTF("[ERROR] failed to load model: %s\r\n", nn_strerror(err));
|
|
return EXIT_FAILURE;
|
|
}
|
|
|
|
/**
|
|
* Acquire the input tensor, will be used for loading the sample image into the model.
|
|
*/
|
|
float threshold = 0.5, nms_threshold=0.6;
|
|
NNTensor *input=NULL, *anchor=NULL, *trans_tensor=NULL, *score_tensor=NULL;
|
|
NNTensor *prediction_tensor=NULL;
|
|
int class_num = 0, max_boxes = 50;
|
|
|
|
if(SSD_MODEL_VERSION==0) {
|
|
input = nn_context_tensor(context, "Preprocessor/sub");
|
|
if (!input) {
|
|
PRINTF("failed to load layer '%s' from model\n", "input");
|
|
return EXIT_FAILURE;
|
|
}
|
|
|
|
anchor = nn_context_tensor(context, "ssd_anchor_boxes");
|
|
if (!anchor) {
|
|
PRINTF("failed to load layer '%s' from model\n", "anchor");
|
|
return EXIT_FAILURE;
|
|
}
|
|
|
|
trans_tensor = nn_context_tensor(context, "concat");
|
|
if (!trans_tensor) {
|
|
PRINTF("failed to load layer '%s' from model\n", "output2s");
|
|
return EXIT_FAILURE;
|
|
}
|
|
|
|
score_tensor = nn_context_tensor(context, "concat_1");
|
|
if (!score_tensor) {
|
|
PRINTF("failed to load layer '%s' from model\n",
|
|
"score_tensor");
|
|
return EXIT_FAILURE;
|
|
}
|
|
|
|
const int32_t* score_tensor_shape = nn_tensor_shape(score_tensor);
|
|
class_num = score_tensor_shape[2];
|
|
} else {
|
|
|
|
input = nn_context_tensor(context, "input_1");
|
|
if (!input) {
|
|
PRINTF("failed to load layer '%s' from model\n", "input");
|
|
return EXIT_FAILURE;
|
|
}
|
|
|
|
anchor = nn_context_tensor(context, "ssd_anchor_boxes");
|
|
if (!anchor) {
|
|
PRINTF("failed to load layer '%s' from model\n", "anchor");
|
|
return EXIT_FAILURE;
|
|
}
|
|
|
|
prediction_tensor = nn_context_tensor(context, "Identity");
|
|
if (!prediction_tensor) {
|
|
PRINTF("failed to load layer '%s' from model\n", "output");
|
|
return EXIT_FAILURE;
|
|
}
|
|
|
|
const int32_t* prediction_tensor_shape = nn_tensor_shape(prediction_tensor);
|
|
class_num = prediction_tensor_shape[2] - 4;
|
|
}
|
|
|
|
/**
|
|
* Set imgproc mode for float input type. For quantized models imgproc is not used.
|
|
*/
|
|
uint32_t proc = nn_tensor_type(input) == NNTensorType_F32
|
|
? NN_IMAGE_PROC_SIGNED_NORM
|
|
: 0;
|
|
|
|
char bbx_out_tensor_mem[NN_TENSOR_SIZEOF];
|
|
NNTensor* bbx_out_tensor = nn_tensor_init(bbx_out_tensor_mem, engine);
|
|
float* data_bbx_out = NULL;
|
|
int32_t shape_bbx_out[4];
|
|
shape_bbx_out[0] = class_num;
|
|
shape_bbx_out[1] = max_boxes;
|
|
shape_bbx_out[2] = 4;
|
|
|
|
data_bbx_out = (float*) calloc(4 * max_boxes * class_num, sizeof(float));
|
|
if (data_bbx_out == NULL) {
|
|
PRINTF("failed to calloc data_bbx_out");
|
|
return EXIT_FAILURE;
|
|
}
|
|
|
|
err = nn_tensor_assign(bbx_out_tensor,
|
|
NNTensorType_F32,
|
|
3,
|
|
shape_bbx_out,
|
|
data_bbx_out);
|
|
if (err)
|
|
PRINTF("failed to assign bbx_out tensor: %s\n",
|
|
nn_strerror(err));
|
|
|
|
char bbx_out_dim_tensor_mem[NN_TENSOR_SIZEOF];
|
|
NNTensor* bbx_out_dim_tensor = nn_tensor_init(bbx_out_dim_tensor_mem, engine);
|
|
int32_t* data_bbx_out_dim = (int32_t*) calloc(class_num, sizeof(int32_t));
|
|
if (data_bbx_out_dim == NULL) {
|
|
PRINTF("failed to calloc data_bbx_out_dim");
|
|
free(data_bbx_out);
|
|
return EXIT_FAILURE;
|
|
}
|
|
|
|
int32_t shape_bbx_out_dim[2];
|
|
shape_bbx_out_dim[0] = class_num;
|
|
shape_bbx_out_dim[1] = 1;
|
|
err = nn_tensor_assign(bbx_out_dim_tensor,
|
|
NNTensorType_I32,
|
|
2,
|
|
shape_bbx_out_dim,
|
|
data_bbx_out_dim);
|
|
if (err)
|
|
PRINTF("failed to assign indices_len tensor: %s\n",
|
|
nn_strerror(err));
|
|
|
|
char cache_tensor_mem[NN_TENSOR_SIZEOF];
|
|
NNTensor* postprocess_cache_tensor = nn_tensor_init(cache_tensor_mem, engine);
|
|
int32_t max_cache_size = 1024 * 1024;
|
|
float* data_cache = (float*) calloc(max_cache_size, sizeof(float));
|
|
if (data_cache == NULL) {
|
|
PRINTF("failed to calloc data_cache");
|
|
free(data_bbx_out);
|
|
free(data_bbx_out_dim);
|
|
return EXIT_FAILURE;
|
|
}
|
|
|
|
int32_t shape_cache[4];
|
|
shape_cache[0] = 1;
|
|
shape_cache[1] = max_cache_size;
|
|
|
|
err = nn_tensor_assign(postprocess_cache_tensor,
|
|
NNTensorType_F32,
|
|
2,
|
|
shape_cache,
|
|
data_cache);
|
|
if (err)
|
|
PRINTF("failed to assign tensor_cache: %s\n",
|
|
nn_strerror(err));
|
|
/**
|
|
* Acquire the output tensor, will be used for reading out results of model evaluation.
|
|
*/
|
|
size_t output_index = (size_t)(nn_model_outputs(model,NULL)[0]);
|
|
NNTensor *output = nn_context_tensor_index(context, output_index);
|
|
if (!output) {
|
|
PRINTF("[ERROR] failed to retrieve output tensor\r\n");
|
|
}
|
|
|
|
for(int count = 0; count < 10; count++)
|
|
{
|
|
/**
|
|
* The nn_tensor_load_image_ex function will load the image data and attempt to
|
|
* decode it. The function supports PNG and JPEG images and the format is discovered
|
|
* by reading the buffers headers automatically. If this operation fails an error
|
|
* is returned.
|
|
*
|
|
* The final proc parameter of the _ex version of this function allows for preprocessing
|
|
* to be applied to the image as part of loading it into the input. This is useful for
|
|
* models which were trained with specific preprocessing steps but did not include them
|
|
* into the graph. A common case is normalization (x/255) and image whitening or standardization
|
|
* which are often not included in the graph but must be applied to get accurate results.
|
|
*
|
|
* proc==0 performs no pre-processing. proc&1 will perform normalization (x/255) and
|
|
* proc&2 will perform whitening. It would not be common for proc&3 to be requested.
|
|
*/
|
|
int64_t start = os_clock_now();
|
|
err = nn_tensor_load_image_ex(input, sample_image, (size_t) sample_image_size, proc);
|
|
int64_t decode_ns = os_clock_now() - start;
|
|
|
|
if (err) {
|
|
PRINTF("[ERROR] failed to load image: %s\r\n", nn_strerror(err));
|
|
}
|
|
|
|
/**
|
|
* The nn_context_run function performs the actual model evaluation. This causes all layers
|
|
* in the graph to be evaluated. If any error happens on any layer this function will return
|
|
* an error and more details might be reported to stderr depending on the cause.
|
|
*
|
|
* A common warning can be reported when insufficient cache is provided leading to performance
|
|
* degradations. These do not affect the accuracy of the results but do translate to longer
|
|
* inference times.
|
|
*/
|
|
start = os_clock_now();
|
|
err = nn_context_run(context);
|
|
int64_t run_ns = os_clock_now() - start;
|
|
|
|
if (err) {
|
|
PRINTF("[ERROR] failed to run model: %s\r\n", nn_strerror(err));
|
|
}
|
|
|
|
if(SSD_MODEL_VERSION==0) {
|
|
err = nn_ssd_decode_nms_standard_bbx(score_tensor,
|
|
trans_tensor,
|
|
anchor,
|
|
postprocess_cache_tensor,
|
|
logf(threshold/(1.0-threshold)),
|
|
nms_threshold,
|
|
max_boxes,
|
|
bbx_out_tensor,
|
|
bbx_out_dim_tensor);
|
|
}
|
|
else
|
|
{
|
|
err = nn_ssd_decode_nms_variance_bbx(prediction_tensor,
|
|
anchor,
|
|
postprocess_cache_tensor,
|
|
logf(threshold/(1.0-threshold)),
|
|
nms_threshold,
|
|
max_boxes,
|
|
bbx_out_tensor,
|
|
bbx_out_dim_tensor);
|
|
}
|
|
if (err) {
|
|
PRINTF("[ERROR] nn_ssd_decode: %s\r\n", nn_strerror(err));
|
|
}
|
|
|
|
int32_t* data_class_id = nn_tensor_aux_object_by_name(bbx_out_tensor,
|
|
"data_class_id");
|
|
float* data_score_out = (float*)nn_tensor_aux_object_by_name(bbx_out_tensor,
|
|
"data_score_out");
|
|
for (int k = 0; k < class_num-1; k++) {
|
|
const char* label = NULL;
|
|
label = nn_model_label(model, k);
|
|
PRINTF("\t Class ID = [%ld][%s] \r\n", data_class_id[k], label);
|
|
for (int i = 0; i < data_bbx_out_dim[k]; i++) {
|
|
PRINTF("\t \tPredicted bounding box[%d]: %.3f %.3f %.3f %.3f (%f)\r\n",
|
|
i,
|
|
data_bbx_out[nn_tensor_offsetv(bbx_out_tensor, 3, k, i, 0)],
|
|
data_bbx_out[nn_tensor_offsetv(bbx_out_tensor, 3, k, i, 1)],
|
|
data_bbx_out[nn_tensor_offsetv(bbx_out_tensor, 3, k, i, 2)],
|
|
data_bbx_out[nn_tensor_offsetv(bbx_out_tensor, 3, k, i, 3)],
|
|
1.0f / (1.0f + expf(-1.0*data_score_out[i+(k)*max_boxes])));
|
|
}
|
|
}
|
|
|
|
PRINTF(" decode img takes %lld us, inference takes %lld us\r\n\r\n",
|
|
decode_ns / (int) 1e3,
|
|
run_ns / (int) 1e3 );
|
|
}
|
|
|
|
#if LAYER_TIMING
|
|
/**
|
|
* This loop iterates all the layers in the model to query the output tensor.
|
|
* The tensor is in turn queried for timing information. Finally we print out
|
|
* the layer index, type, timing, and name. We ignore layers with no timing
|
|
* information to reduce console output.
|
|
*/
|
|
for (size_t i = 0; i < nn_model_layer_count(model); i++) {
|
|
NNTensor *tensor = nn_context_tensor_index(context, i);
|
|
int64_t tensor_ns = nn_tensor_time(tensor);
|
|
|
|
// Ignore layers with 0 time, such as constant layers.
|
|
if (tensor_ns == 0) continue;
|
|
int tensor_ms = (int)(tensor_ns/1e6);
|
|
const char *name = nn_model_layer_name(model, i);
|
|
const char *type = nn_model_layer_type(model, i);
|
|
|
|
PRINTF("%d: %s [%d ms] %s\r\n", i, type, tensor_ms, name);
|
|
}
|
|
#endif
|
|
free(data_bbx_out);
|
|
free(data_bbx_out_dim);
|
|
free(data_cache);
|
|
return 0;
|
|
}
|