/* MCUXpresso_MIMXRT1052xxxxB/boards/evkbimxrt1050/eiq_examples/deepviewrt_image_detection/main.c */
/**
* Copyright 2018 Au-Zone Technologies
* All rights reserved.
*
* Software that is described herein is for illustrative purposes only which
* provides customers with programming information regarding the DeepViewRT
* library. This software is supplied "AS IS" without any warranties of any
* kind, and Au-Zone Technologies and its licensor disclaim any and all
* warranties, express or implied, including all implied warranties of
* merchantability, fitness for a particular purpose and non-infringement of
* intellectual property rights. Au-Zone Technologies assumes no responsibility
* or liability for the use of the software, conveys no license or rights under
* any patent, copyright, mask work right, or any other intellectual property
* rights in or to any products. Au-Zone Technologies reserves the right to make
* changes in the software without notification. Au-Zone Technologies also makes
* no representation or warranty that such application will be suitable for the
* specified use without further testing or modification.
*
* Permission to use, copy, modify, and distribute this software and its
* documentation requires authorization from Au-Zone Technologies which is
* available free of charge by visiting https://embeddedml.com/deepview-samples
*/
#include <stdio.h>
#include <stdlib.h> /* calloc, free, EXIT_FAILURE */
#include <string.h> /* memcpy */
#include <math.h>
#include "fsl_device_registers.h"
#include "fsl_debug_console.h"
#include "clock_config.h"
#include "board_init.h"
#include "board.h"
#include <cr_section_macros.h>
#include "deepview_rt.h"
#include "deepview_ops.h"
/*******************************************************************************
* Definitions
******************************************************************************/
/**
 * If LAYER_TIMING is set to 1, timing information for all non-constant
 * layers is printed after the main loop completes.
 */
#define LAYER_TIMING 0
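/*
 * With LAYER_TIMING enabled, the loop at the end of main() prints one line
 * per profiled layer in the form "<index>: <type> [<ms> ms] <name>", e.g.
 * "12: Conv [85 ms] conv_1" (the index, type, time, and name here are
 * purely illustrative, not measured output).
 */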
/**
* If LOAD_MODEL_TO_SDRAM is set to 1 then the model will be
* copied from FLASH into SDRAM before being loaded.
*/
#define LOAD_MODEL_TO_SDRAM 0
/**
 * The cache is used by DeepViewRT to optimize certain internal loops. It is
 * optional, but if used it should be placed in the fastest available
 * memory; in this case we use SRAM_DTC.
 */
#if defined(CPU_MIMXRT1176DVMAA_cm7)
#define CACHE_SIZE (256 * 1024)
#else
/* A 128 KiB cache is sufficient for the default model. */
#define CACHE_SIZE (128 * 1024)
#endif
/**
 * 0 - TF1.x MobileNetV1/V2 SSD
 * 1 - eIQ SSD
 */
#define SSD_MODEL_VERSION 0
/**
 * The mempool holds the intermediate buffers used while evaluating the
 * model. This buffer can be multiple megabytes in size and therefore should
 * be stored in SDRAM. Adjust this size if your particular model requires
 * more or less memory, as reported by the conversion tool and constrained
 * by the board's SDRAM size. If insufficient memory is provided an error
 * will be reported by nn_context_model_load().
 */
#define MEMPOOL_SIZE (5 * 1024 * 1024)
/*******************************************************************************
* Prototypes
******************************************************************************/
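/*
 * Post-processing helpers used after each inference to turn raw SSD outputs
 * into final detections. The "standard" variant takes separate score and
 * box-transform tensors (the TF1.x-style SSD path), while the "variance"
 * variant takes a single combined prediction tensor (the eIQ SSD path).
 * Both apply the score threshold and non-maximum suppression, writing the
 * surviving boxes to bbx_out_tensor and the per-class box counts to
 * bbx_out_dim_tensor.
 */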
NN_API NNError
nn_ssd_decode_nms_standard_bbx(NNTensor* score_tensor,
                               NNTensor* trans,
                               NNTensor* anchors,
                               NNTensor* cache,
                               float score_threshold,
                               float iou_threshold,
                               int32_t max_output_size,
                               NNTensor* bbx_out_tensor,
                               NNTensor* bbx_out_dim_tensor);

NN_API NNError
nn_ssd_decode_nms_variance_bbx(NNTensor* prediction,
                               NNTensor* anchors,
                               NNTensor* cache,
                               float score_threshold,
                               float iou_threshold,
                               int32_t max_output_size_per_class,
                               NNTensor* bbx_out_tensor,
                               NNTensor* bbx_out_dim_tensor);
/*******************************************************************************
* Variables
******************************************************************************/
#if defined(__ICCARM__) /* for the IAR toolchain */
extern const unsigned char model_rtm_start[];
extern const unsigned char sample_img_start[];
#else
/* DeepViewRT Model definition from model.S */
extern const unsigned char model_rtm_start;
extern const unsigned char model_rtm_end;
/* Sample image definition from model.S */
extern const unsigned char sample_img_start;
extern const unsigned char sample_img_end;
#endif
/**
 * The DeepViewRT cache buffer is placed in SRAM_DTC (DTCM) for maximum
 * performance. The pointer below aims directly at the DTCM base address;
 * the commented-out declaration shows the equivalent linker-section
 * approach.
 */
/* __BSS(SRAM_DTC_cm7) uint8_t cache[CACHE_SIZE] __attribute__((aligned(32))); */
uint8_t *cache = (uint8_t *)(0x20000000); /* Cache in DTCM; works for the 1170 and 106x */
/**
* The DeepViewRT Memory Pool buffer holds intermediate buffers and is
* stored in SDRAM for maximum storage space.
*/
__BSS(BOARD_SDRAM) uint8_t mempool[MEMPOOL_SIZE] __attribute__((aligned(32)));
#if LOAD_MODEL_TO_SDRAM
/**
* MEMBLOB_SIZE needs to be at least as large as the RTM model file.
*/
#define MEMBLOB_SIZE (10 * 1024 * 1024)
__BSS(BOARD_SDRAM) uint8_t memblob[MEMBLOB_SIZE] __attribute__((aligned(32)));
#endif
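/**
 * No compute engine is used on MCU devices (the same API is provided across
 * all platforms), so tensors below are initialized with a NULL engine.
 */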
static NNEngine* engine = NULL;
/**
 * SysTick_Handler triggers every millisecond and increments g_systickCounter.
 */
volatile int32_t g_systickCounter = 0;

void SysTick_Handler(void)
{
    g_systickCounter++;
}
/**
 * This symbol is required by DeepViewRT for internal time keeping and MUST
 * return a 64-bit signed integer of continuous nanoseconds. The epoch is
 * not important but the counter should never reset during model evaluation.
 *
 * This sample has no overflow protection, so the counter wraps after 2^31
 * milliseconds (roughly 24.8 days). This is generally not a problem, though
 * production software should use more robust time keeping.
 */
int64_t os_clock_now(void)
{
    return ((int64_t) g_systickCounter) * 1000000LL;
}
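/*
 * A minimal sketch of more robust time keeping, as suggested above for
 * production software (hypothetical, not part of this sample). It would
 * replace the counter and the two functions above: widening the tick count
 * to 64 bits pushes the wrap-around out to roughly 292 years, and the
 * double-read guards against torn 64-bit reads on a 32-bit core.
 */
#if 0
volatile int64_t g_systickCounter64 = 0;

void SysTick_Handler(void)
{
    g_systickCounter64++; /* still one tick per millisecond */
}

int64_t os_clock_now(void)
{
    int64_t a, b;
    do { /* re-read until stable: 64-bit loads are not atomic on Cortex-M */
        a = g_systickCounter64;
        b = g_systickCounter64;
    } while (a != b);
    return a * 1000000LL; /* milliseconds -> nanoseconds */
}
#endif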
/*!
* @brief Main function
*/
int main(void)
{
    /* Init board hardware. */
    BOARD_Init();
    /**
     * Initialize the SysTick to fire every millisecond. If this period is
     * changed, os_clock_now() must be adjusted accordingly so it continues
     * to report nanoseconds.
     */
    SysTick_Config(SystemCoreClock / 1000U);
    PRINTF("==========================================================================\r\n");
    PRINTF(" DeepViewRT Image Detection Demo\r\n");
    PRINTF("==========================================================================\r\n");
    //PRINTF("CPU: %d Hz\r\n", CLOCK_GetFreq(kCLOCK_CpuClk));
    //PRINTF("AHB: %d Hz\r\n", CLOCK_GetFreq(kCLOCK_AhbClk));
    //PRINTF("SEMC: %d Hz\r\n", CLOCK_GetFreq(kCLOCK_Semc));
    /**
     * The model and model_size are set up at startup based on the
     * model_rtm_start and model_rtm_end symbols from the model.S file.
     */
#if defined(__ICCARM__) /* for the IAR toolchain */
    const uint8_t *model = model_rtm_start;
    int model_size = 8 * 1024 * 1024; /* fixed upper bound; the IAR build declares no end symbol */
#else
    const uint8_t *model_end = &model_rtm_end;
    const uint8_t *model = &model_rtm_start;
    int model_size = model_end - model;
#endif
    if (model_size < 1) {
        PRINTF("[ERROR] invalid model_size (%d); verify the model.S implementation.\r\n", model_size);
        return EXIT_FAILURE;
    }
    /**
     * Just like the model above, but from sample_img_start/sample_img_end.
     */
#if defined(__ICCARM__) /* for the IAR toolchain */
    const uint8_t *sample_image = sample_img_start;
    int sample_image_size = 200 * 1024; /* fixed upper bound; the IAR build declares no end symbol */
#else
    const uint8_t *image_end = &sample_img_end;
    const uint8_t *sample_image = &sample_img_start;
    int sample_image_size = image_end - sample_image;
#endif
    if (sample_image_size < 1) {
        PRINTF("[ERROR] invalid sample_image_size (%d); verify the model.S implementation.\r\n", sample_image_size);
        return EXIT_FAILURE;
    }
    /**
     * The NNContext structure holds runtime model data, including the memory
     * pool and optional cache. The first parameter is the engine, which is
     * not used on MCU devices; the same API is provided across all platforms.
     *
     * If MEMPOOL_SIZE is 0 then each layer's tensor is allocated on the heap
     * using malloc instead of using the optimized memory map. If MEMPOOL_SIZE
     * is greater than 0 but mempool is NULL then the pool is allocated from
     * the heap and the optimized memory map IS still used.
     *
     * If CACHE_SIZE is 0 then no cache is used and convolutions in particular
     * take significantly longer. If CACHE_SIZE is greater than 0 but cache is
     * NULL then the cache is allocated on the heap. This provides better
     * performance than no cache, but depends on the speed of heap memory: if
     * the heap is in SDRAM the model runs approximately 10x slower than with
     * the cache in SRAM_DTC.
     *
     * If the heap is too small for the requested configuration, the returned
     * context will be NULL.
     */
    NNContext *context = nn_context_init(NULL,
                                         MEMPOOL_SIZE, mempool,
                                         CACHE_SIZE, cache);
    if (!context) {
        PRINTF("[ERROR] insufficient memory to create context\r\n");
        return EXIT_FAILURE;
    }
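    /*
     * A minimal sketch of the heap-backed alternative described in the
     * comment above (hypothetical; this sample uses the static buffers):
     * passing NULL for the pool and cache pointers makes nn_context_init()
     * allocate both from the heap while keeping the optimized memory map.
     */
#if 0
    NNContext *heap_context = nn_context_init(NULL,
                                              MEMPOOL_SIZE, NULL,
                                              CACHE_SIZE, NULL);
#endif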
#if LOAD_MODEL_TO_SDRAM
    if (model_size < MEMBLOB_SIZE) {
        memcpy(memblob, model, model_size);
        model = (const uint8_t *)memblob;
        PRINTF("Model loaded to SDRAM...\r\n");
    } else {
        PRINTF("Model too large (%d) for SDRAM buffer (%d)\r\n", model_size, MEMBLOB_SIZE);
    }
#endif
    /**
     * Load the model into the context. If the model is invalid because of
     * corruption or misalignment an error is returned. If the provided
     * MEMPOOL_SIZE is insufficient an error is also returned. Many of these
     * internal errors are additionally logged through stderr.
     */
    NNError err = nn_context_model_load(context, (size_t) model_size, model);
    if (err) {
        PRINTF("[ERROR] failed to load model: %s\r\n", nn_strerror(err));
        return EXIT_FAILURE;
    }
    /**
     * Acquire the input, anchor, and raw output tensors; these are used to
     * load the sample image into the model and to read back the raw SSD
     * results for post-processing.
     */
    float threshold = 0.5f, nms_threshold = 0.6f;
    NNTensor *input = NULL, *anchor = NULL, *trans_tensor = NULL, *score_tensor = NULL;
    NNTensor *prediction_tensor = NULL;
    int class_num = 0, max_boxes = 50;
    if (SSD_MODEL_VERSION == 0) {
        input = nn_context_tensor(context, "Preprocessor/sub");
        if (!input) {
            PRINTF("failed to load layer '%s' from model\r\n", "Preprocessor/sub");
            return EXIT_FAILURE;
        }
        anchor = nn_context_tensor(context, "ssd_anchor_boxes");
        if (!anchor) {
            PRINTF("failed to load layer '%s' from model\r\n", "ssd_anchor_boxes");
            return EXIT_FAILURE;
        }
        trans_tensor = nn_context_tensor(context, "concat");
        if (!trans_tensor) {
            PRINTF("failed to load layer '%s' from model\r\n", "concat");
            return EXIT_FAILURE;
        }
        score_tensor = nn_context_tensor(context, "concat_1");
        if (!score_tensor) {
            PRINTF("failed to load layer '%s' from model\r\n", "concat_1");
            return EXIT_FAILURE;
        }
        const int32_t *score_tensor_shape = nn_tensor_shape(score_tensor);
        class_num = score_tensor_shape[2];
    } else {
        input = nn_context_tensor(context, "input_1");
        if (!input) {
            PRINTF("failed to load layer '%s' from model\r\n", "input_1");
            return EXIT_FAILURE;
        }
        anchor = nn_context_tensor(context, "ssd_anchor_boxes");
        if (!anchor) {
            PRINTF("failed to load layer '%s' from model\r\n", "ssd_anchor_boxes");
            return EXIT_FAILURE;
        }
        prediction_tensor = nn_context_tensor(context, "Identity");
        if (!prediction_tensor) {
            PRINTF("failed to load layer '%s' from model\r\n", "Identity");
            return EXIT_FAILURE;
        }
        const int32_t *prediction_tensor_shape = nn_tensor_shape(prediction_tensor);
        class_num = prediction_tensor_shape[2] - 4;
    }
    /**
     * Select the imgproc mode: float inputs need signed normalization, while
     * quantized models take the raw image (imgproc is not used).
     */
    uint32_t proc = nn_tensor_type(input) == NNTensorType_F32
                        ? NN_IMAGE_PROC_SIGNED_NORM
                        : 0;
    char bbx_out_tensor_mem[NN_TENSOR_SIZEOF];
    NNTensor *bbx_out_tensor = nn_tensor_init(bbx_out_tensor_mem, engine);
    float *data_bbx_out = NULL;
    int32_t shape_bbx_out[3];
    shape_bbx_out[0] = class_num;
    shape_bbx_out[1] = max_boxes;
    shape_bbx_out[2] = 4;
    data_bbx_out = (float *) calloc(4 * max_boxes * class_num, sizeof(float));
    if (data_bbx_out == NULL) {
        PRINTF("failed to calloc data_bbx_out\r\n");
        return EXIT_FAILURE;
    }
    err = nn_tensor_assign(bbx_out_tensor,
                           NNTensorType_F32,
                           3,
                           shape_bbx_out,
                           data_bbx_out);
    if (err)
        PRINTF("failed to assign bbx_out tensor: %s\r\n",
               nn_strerror(err));
    char bbx_out_dim_tensor_mem[NN_TENSOR_SIZEOF];
    NNTensor *bbx_out_dim_tensor = nn_tensor_init(bbx_out_dim_tensor_mem, engine);
    int32_t *data_bbx_out_dim = (int32_t *) calloc(class_num, sizeof(int32_t));
    if (data_bbx_out_dim == NULL) {
        PRINTF("failed to calloc data_bbx_out_dim\r\n");
        free(data_bbx_out);
        return EXIT_FAILURE;
    }
    int32_t shape_bbx_out_dim[2];
    shape_bbx_out_dim[0] = class_num;
    shape_bbx_out_dim[1] = 1;
    err = nn_tensor_assign(bbx_out_dim_tensor,
                           NNTensorType_I32,
                           2,
                           shape_bbx_out_dim,
                           data_bbx_out_dim);
    if (err)
        PRINTF("failed to assign bbx_out_dim tensor: %s\r\n",
               nn_strerror(err));
    char cache_tensor_mem[NN_TENSOR_SIZEOF];
    NNTensor *postprocess_cache_tensor = nn_tensor_init(cache_tensor_mem, engine);
    int32_t max_cache_size = 1024 * 1024;
    float *data_cache = (float *) calloc(max_cache_size, sizeof(float));
    if (data_cache == NULL) {
        PRINTF("failed to calloc data_cache\r\n");
        free(data_bbx_out);
        free(data_bbx_out_dim);
        return EXIT_FAILURE;
    }
    int32_t shape_cache[2];
    shape_cache[0] = 1;
    shape_cache[1] = max_cache_size;
    err = nn_tensor_assign(postprocess_cache_tensor,
                           NNTensorType_F32,
                           2,
                           shape_cache,
                           data_cache);
    if (err)
        PRINTF("failed to assign postprocess_cache_tensor: %s\r\n",
               nn_strerror(err));
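    /*
     * At this point three caller-owned buffers back the decode call inside
     * the main loop: bbx_out_tensor receives the surviving boxes,
     * bbx_out_dim_tensor the number of boxes kept per class, and
     * postprocess_cache_tensor the scratch space for the NMS step.
     */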
    /**
     * Look up the model's first output tensor. The detection results are
     * actually read back through the SSD decode tensors above, so this
     * lookup mainly verifies that the model declares an output.
     */
    size_t output_index = (size_t)(nn_model_outputs(model, NULL)[0]);
    NNTensor *output = nn_context_tensor_index(context, output_index);
    if (!output) {
        PRINTF("[ERROR] failed to retrieve output tensor\r\n");
    }
    for (int count = 0; count < 10; count++)
    {
        /**
         * nn_tensor_load_image_ex loads the image data and attempts to decode
         * it. The function supports PNG and JPEG images; the format is
         * discovered automatically by reading the buffer's header. If this
         * operation fails an error is returned.
         *
         * The final proc parameter of the _ex version of this function allows
         * preprocessing to be applied to the image as part of loading it into
         * the input tensor. This is useful for models which were trained with
         * specific preprocessing steps that were not included in the graph. A
         * common case is normalization (x/255) or image whitening or
         * standardization, which are often not included in the graph but must
         * be applied to get accurate results.
         *
         * proc==0 performs no pre-processing, proc&1 performs normalization
         * (x/255), and proc&2 performs whitening. It would not be common for
         * proc&3 to be requested.
         */
        int64_t start = os_clock_now();
        err = nn_tensor_load_image_ex(input, sample_image, (size_t) sample_image_size, proc);
        int64_t decode_ns = os_clock_now() - start;
        if (err) {
            PRINTF("[ERROR] failed to load image: %s\r\n", nn_strerror(err));
        }
        /**
         * nn_context_run performs the actual model evaluation, causing all
         * layers in the graph to be evaluated. If any layer fails, this
         * function returns an error and more details may be reported to
         * stderr depending on the cause.
         *
         * A common warning is reported when insufficient cache is provided,
         * leading to performance degradation. This does not affect the
         * accuracy of the results but does translate into longer inference
         * times.
         */
        start = os_clock_now();
        err = nn_context_run(context);
        int64_t run_ns = os_clock_now() - start;
        if (err) {
            PRINTF("[ERROR] failed to run model: %s\r\n", nn_strerror(err));
        }
        /**
         * The model outputs raw (pre-sigmoid) class scores, so the score
         * threshold is converted into logit space with logf(t / (1 - t))
         * rather than applying a sigmoid to every candidate box.
         */
        if (SSD_MODEL_VERSION == 0) {
            err = nn_ssd_decode_nms_standard_bbx(score_tensor,
                                                 trans_tensor,
                                                 anchor,
                                                 postprocess_cache_tensor,
                                                 logf(threshold / (1.0f - threshold)),
                                                 nms_threshold,
                                                 max_boxes,
                                                 bbx_out_tensor,
                                                 bbx_out_dim_tensor);
        } else {
            err = nn_ssd_decode_nms_variance_bbx(prediction_tensor,
                                                 anchor,
                                                 postprocess_cache_tensor,
                                                 logf(threshold / (1.0f - threshold)),
                                                 nms_threshold,
                                                 max_boxes,
                                                 bbx_out_tensor,
                                                 bbx_out_dim_tensor);
        }
        if (err) {
            PRINTF("[ERROR] nn_ssd_decode: %s\r\n", nn_strerror(err));
        }
        int32_t *data_class_id = nn_tensor_aux_object_by_name(bbx_out_tensor,
                                                              "data_class_id");
        float *data_score_out = (float *) nn_tensor_aux_object_by_name(bbx_out_tensor,
                                                                       "data_score_out");
        for (int k = 0; k < class_num - 1; k++) {
            const char *label = nn_model_label(model, k);
            PRINTF("\t Class ID = [%ld][%s] \r\n", data_class_id[k], label);
            for (int i = 0; i < data_bbx_out_dim[k]; i++) {
                PRINTF("\t \tPredicted bounding box[%d]: %.3f %.3f %.3f %.3f (%f)\r\n",
                       i,
                       data_bbx_out[nn_tensor_offsetv(bbx_out_tensor, 3, k, i, 0)],
                       data_bbx_out[nn_tensor_offsetv(bbx_out_tensor, 3, k, i, 1)],
                       data_bbx_out[nn_tensor_offsetv(bbx_out_tensor, 3, k, i, 2)],
                       data_bbx_out[nn_tensor_offsetv(bbx_out_tensor, 3, k, i, 3)],
                       /* scores are logits; apply the sigmoid for a 0..1 confidence */
                       1.0f / (1.0f + expf(-data_score_out[i + k * max_boxes])));
            }
        }
        PRINTF(" decode img takes %lld us, inference takes %lld us\r\n\r\n",
               decode_ns / 1000,
               run_ns / 1000);
    }
#if LAYER_TIMING
    /**
     * This loop iterates over all the layers in the model to query each
     * output tensor. The tensor is in turn queried for timing information.
     * Finally we print the layer index, type, timing, and name. Layers with
     * no timing information are skipped to reduce console output.
     */
    for (size_t i = 0; i < nn_model_layer_count(model); i++) {
        NNTensor *tensor = nn_context_tensor_index(context, i);
        int64_t tensor_ns = nn_tensor_time(tensor);
        /* Ignore layers with 0 time, such as constant layers. */
        if (tensor_ns == 0) continue;
        int tensor_ms = (int)(tensor_ns / 1e6);
        const char *name = nn_model_layer_name(model, i);
        const char *type = nn_model_layer_type(model, i);
        PRINTF("%u: %s [%d ms] %s\r\n", (unsigned) i, type, tensor_ms, name);
    }
#endif
    free(data_bbx_out);
    free(data_bbx_out_dim);
    free(data_cache);
    return 0;
}