From f89ea3cbe3d6e326a035d602ecf34ac4659e69fe Mon Sep 17 00:00:00 2001 From: simon Date: Sat, 6 Jun 2026 17:15:55 +0200 Subject: [PATCH] Stabilized OTA Update with Retries --- main/ota_espnow.c | 151 +++++++++++++++++++++++++++++++++++++--------- main/ota_uart.c | 30 ++++++++- main/ota_uart.h | 12 +++- 3 files changed, 163 insertions(+), 30 deletions(-) diff --git a/main/ota_espnow.c b/main/ota_espnow.c index 37b0665..ffc7d20 100644 --- a/main/ota_espnow.c +++ b/main/ota_espnow.c @@ -18,9 +18,9 @@ static const char *TAG = "[OTA_ESPNOW]"; #define OTA_ESPNOW_PREPARE_PRIO 5 #define OTA_PREPARE_TIMEOUT_MS 120000u -#define OTA_BLOCK_TIMEOUT_MS 30000u +#define OTA_BLOCK_TIMEOUT_PER_SLAVE_MS 2000u +#define OTA_BLOCK_MAX_RETRIES 2u #define OTA_END_TIMEOUT_MS 60000u -#define OTA_PAYLOAD_DELAY_MS 3 #define OTA_ST_PREPARING 1u #define OTA_ST_READY 2u @@ -35,7 +35,8 @@ static const char *TAG = "[OTA_ESPNOW]"; #define OTA_MAX_TARGETS CLIENT_REGISTRY_MAX -#define OTA_SLAVE_WORK_QUEUE_LEN 12 +/** ~21 payloads per 4 KiB block; headroom for bursts + status/end. 
*/ +#define OTA_SLAVE_WORK_QUEUE_LEN 32 #define OTA_SLAVE_WORK_STACK 8192 #define OTA_SLAVE_WORK_PRIO 5 @@ -173,6 +174,54 @@ static bool wait_target_bits(uint32_t want_bits, uint32_t timeout_ms) { return (got & want_bits) == want_bits; } +static uint32_t block_ack_timeout_ms(void) { + if (s_dist.count == 0) { + return OTA_BLOCK_TIMEOUT_PER_SLAVE_MS; + } + return (uint32_t)s_dist.count * OTA_BLOCK_TIMEOUT_PER_SLAVE_MS; +} + +static void log_missing_block_acks(uint32_t expected_bytes) { + if (s_eg == NULL || s_dist.count == 0) { + return; + } + EventBits_t bits = xEventGroupGetBits(s_eg); + for (uint8_t i = 0; i < s_dist.count; i++) { + uint32_t bit = (1u << (unsigned)i); + if (bits & bit) { + continue; + } + const ota_prog_entry_t *e = &s_prog.entries[i]; + ESP_LOGE(TAG, + "slave %lu missing block ack @%lu (last status=%lu bytes=%lu err=%lu)", + (unsigned long)s_dist.id[i], (unsigned long)expected_bytes, + (unsigned long)e->status, (unsigned long)e->bytes_written, + (unsigned long)e->error); + } +} + +static esp_err_t send_block_payloads(const uint8_t *block_buf, uint32_t block_len, + uint32_t *seq_io) { + uint32_t sent = 0; + while (sent < block_len) { + uint32_t chunk = block_len - sent; + if (chunk > OTA_UART_HOST_CHUNK_SIZE) { + chunk = OTA_UART_HOST_CHUNK_SIZE; + } + + for (uint8_t i = 0; i < s_dist.count; i++) { + esp_err_t err = esp_now_comm_send_ota_payload(s_dist.mac[i], *seq_io, + block_buf + sent, chunk); + if (err != ESP_OK) { + return err; + } + } + (*seq_io)++; + sent += chunk; + } + return ESP_OK; +} + bool ota_espnow_distribution_active(void) { return s_distribution_active; } static void send_slave_status(const uint8_t master_mac[6], uint32_t status, @@ -221,8 +270,20 @@ static void process_slave_payload(const uint8_t master_mac[6], ESP_LOGI(TAG, "ESP-NOW OTA payloads started"); } - ota_feed_result_t r = - ota_uart_feed(payload->data.bytes, payload->data.size); + ota_feed_result_t r = ota_uart_feed_chunk(payload->seq, payload->data.bytes, + 
payload->data.size); + if (r == OTA_FEED_SEQ_GAP) { + led_ring_ota_failed(); + send_slave_status(master_mac, OTA_ST_FAILED, ota_uart_bytes_written(), 16); + return; + } + if (r == OTA_FEED_SEQ_DUP) { + if (ota_uart_block_ready_for_reack()) { + send_slave_status(master_mac, OTA_ST_BLOCK_ACK, ota_uart_bytes_written(), + 0); + } + return; + } if (r == OTA_FEED_ERROR) { led_ring_ota_failed(); send_slave_status(master_mac, OTA_ST_FAILED, ota_uart_bytes_written(), 13); @@ -509,35 +570,71 @@ static esp_err_t distribute_image(const esp_partition_t *partition, return err; } - uint32_t sent = 0; - while (sent < block_len) { - uint32_t chunk = block_len - sent; - if (chunk > OTA_UART_HOST_CHUNK_SIZE) { - chunk = OTA_UART_HOST_CHUNK_SIZE; - } + const bool full_block = (block_len >= OTA_UART_FLASH_BLOCK_SIZE); + s_dist.expected_bytes = offset + block_len; + const uint32_t block_start_seq = seq; - for (uint8_t i = 0; i < s_dist.count; i++) { - err = esp_now_comm_send_ota_payload(s_dist.mac[i], seq, - block_buf + sent, chunk); + if (full_block) { + xEventGroupClearBits(s_eg, target_mask); + } + + bool block_sent = false; + for (uint32_t send_attempt = 0; send_attempt <= OTA_BLOCK_MAX_RETRIES; + send_attempt++) { + if (send_attempt > 0) { + seq = block_start_seq; + if (full_block) { + xEventGroupClearBits(s_eg, target_mask); + } + ESP_LOGW(TAG, "block send failed @%lu — resend %lu/%lu", + (unsigned long)s_dist.expected_bytes, + (unsigned long)send_attempt, + (unsigned long)OTA_BLOCK_MAX_RETRIES); + } + err = send_block_payloads(block_buf, block_len, &seq); + if (err == ESP_OK) { + block_sent = true; + break; + } + } + if (!block_sent) { + ESP_LOGE(TAG, "block send failed @%lu after %lu retries", + (unsigned long)s_dist.expected_bytes, + (unsigned long)OTA_BLOCK_MAX_RETRIES); + prog_end(); + s_distribution_active = false; + return err; + } + + if (full_block) { + const uint32_t ack_timeout = block_ack_timeout_ms(); + bool acked = false; + for (uint32_t attempt = 0; attempt <= 
OTA_BLOCK_MAX_RETRIES; attempt++) { + if (wait_target_bits(target_mask, ack_timeout)) { + acked = true; + break; + } + log_missing_block_acks(s_dist.expected_bytes); + if (attempt >= OTA_BLOCK_MAX_RETRIES) { + break; + } + ESP_LOGW(TAG, "block ack timeout @%lu — resend %lu/%lu", + (unsigned long)s_dist.expected_bytes, + (unsigned long)(attempt + 1), + (unsigned long)OTA_BLOCK_MAX_RETRIES); + xEventGroupClearBits(s_eg, target_mask); + seq = block_start_seq; + err = send_block_payloads(block_buf, block_len, &seq); if (err != ESP_OK) { prog_end(); s_distribution_active = false; return err; } } - seq++; - sent += chunk; - vTaskDelay(pdMS_TO_TICKS(OTA_PAYLOAD_DELAY_MS)); - } - - const bool full_block = (block_len >= OTA_UART_FLASH_BLOCK_SIZE); - s_dist.expected_bytes = offset + block_len; - - if (full_block) { - xEventGroupClearBits(s_eg, target_mask); - if (!wait_target_bits(target_mask, OTA_BLOCK_TIMEOUT_MS)) { - ESP_LOGE(TAG, "timeout block ack @%lu bytes", - (unsigned long)s_dist.expected_bytes); + if (!acked) { + ESP_LOGE(TAG, "timeout block ack @%lu bytes after %lu retries", + (unsigned long)s_dist.expected_bytes, + (unsigned long)OTA_BLOCK_MAX_RETRIES); prog_end(); s_distribution_active = false; return ESP_ERR_TIMEOUT; diff --git a/main/ota_uart.c b/main/ota_uart.c index 0a6a25c..965b026 100644 --- a/main/ota_uart.c +++ b/main/ota_uart.c @@ -12,6 +12,7 @@ typedef struct { uint32_t total_size; uint32_t received; uint32_t written; + uint32_t expected_seq; int target_slot; uint8_t block_buf[OTA_UART_FLASH_BLOCK_SIZE]; size_t block_len; @@ -112,10 +113,30 @@ int ota_uart_prepare(uint32_t total_size) { return s_ota.target_slot; } -ota_feed_result_t ota_uart_feed(const uint8_t *data, size_t len) { +bool ota_uart_block_ready_for_reack(void) { + if (!s_ota.active) { + return false; + } + return s_ota.written > 0 && + (s_ota.written % OTA_UART_FLASH_BLOCK_SIZE) == 0 && + s_ota.block_len == 0; +} + +ota_feed_result_t ota_uart_feed_chunk(uint32_t seq, const uint8_t *data, + 
size_t len) { if (!s_ota.active || data == NULL || len == 0) { return OTA_FEED_ERROR; } + if (seq < s_ota.expected_seq) { + return OTA_FEED_SEQ_DUP; + } + if (seq > s_ota.expected_seq) { + ESP_LOGW(TAG, "seq gap: got %lu expected %lu", (unsigned long)seq, + (unsigned long)s_ota.expected_seq); + return OTA_FEED_SEQ_GAP; + } + s_ota.expected_seq++; + if (len > OTA_UART_HOST_CHUNK_SIZE) { ESP_LOGW(TAG, "chunk %u > %u, truncating", (unsigned)len, OTA_UART_HOST_CHUNK_SIZE); @@ -200,6 +221,13 @@ esp_err_t ota_uart_finish(bool set_boot, bool *success_out) { return err; } + if (s_ota.total_size > 0 && s_ota.received != s_ota.total_size) { + ESP_LOGE(TAG, "size mismatch: received=%lu expected=%lu", + (unsigned long)s_ota.received, (unsigned long)s_ota.total_size); + ota_uart_abort(); + return ESP_ERR_INVALID_SIZE; + } + err = esp_ota_end(s_ota.handle); if (err != ESP_OK) { ESP_LOGE(TAG, "esp_ota_end failed: %s", esp_err_to_name(err)); diff --git a/main/ota_uart.h b/main/ota_uart.h index a086b34..8a21e70 100644 --- a/main/ota_uart.h +++ b/main/ota_uart.h @@ -28,6 +28,8 @@ typedef enum { typedef enum { OTA_FEED_OK = 0, OTA_FEED_BLOCK_WRITTEN, + OTA_FEED_SEQ_DUP, + OTA_FEED_SEQ_GAP, OTA_FEED_ERROR, } ota_feed_result_t; @@ -41,8 +43,14 @@ int ota_uart_prepare(uint32_t total_size); void ota_uart_abort(void); -/** Append up to 200 bytes; flushes 4 KiB blocks to flash when full. */ -ota_feed_result_t ota_uart_feed(const uint8_t *data, size_t len); +/** + * Append up to 200 bytes with strict seq checking (0, 1, 2, …). + * Duplicates (seq < expected) return OTA_FEED_SEQ_DUP; gaps return OTA_FEED_SEQ_GAP. + */ +ota_feed_result_t ota_uart_feed_chunk(uint32_t seq, const uint8_t *data, size_t len); + +/** True when a full 4 KiB block is in flash (used to re-ACK host block retries). */ +bool ota_uart_block_ready_for_reack(void); uint32_t ota_uart_bytes_written(void);