Skip to content

Commit 136b326

Browse files
authored
XRT-317 add retry when calling wireserver APIs in azure mpd plugin (#3045)
1 parent dd97728 commit 136b326

File tree

3 files changed

+65
-20
lines changed

3 files changed

+65
-20
lines changed

src/runtime_src/core/pcie/driver/linux/xocl/subdev/mailbox.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,7 @@ MODULE_PARM_DESC(mailbox_no_intr,
227227

228228
#define MAILBOX_TIMER (HZ / 50) /* in jiffies */
229229
#define MAILBOX_SEC2TIMER(s) ((s) * HZ / MAILBOX_TIMER)
230-
#define MSG_RX_DEFAULT_TTL 20UL /* in seconds */
230+
#define MSG_RX_DEFAULT_TTL 60UL /* in seconds */
231231
#define MSG_TX_DEFAULT_TTL 2UL /* in seconds */
232232
#define MSG_TX_PER_MB_TTL 1UL /* in seconds */
233233
#define MSG_MAX_TTL 0xFFFFFFFF /* used to disable timer */

src/runtime_src/core/pcie/tools/cloud-daemon/azure/azure.cpp

Lines changed: 60 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Partial Copyright (C) 2019 Xilinx, Inc
2+
* Partial Copyright (C) 2019-2020 Xilinx, Inc
33
*
44
* Microsoft provides sample code how RESTful APIs are being called
55
*
@@ -25,6 +25,7 @@
2525
#include <syslog.h>
2626
#include <openssl/sha.h>
2727
#include <curl/curl.h>
28+
#include <sys/time.h>
2829

2930
#include <cstdio>
3031
#include <cstring>
@@ -229,6 +230,9 @@ int AzureDev::azureLoadXclBin(const xclBin *buffer)
229230
index++;
230231
}
231232

233+
timeval tim_s, tim_e;
234+
gettimeofday(&tim_s, NULL);
235+
std::cout << fpgaSerialNumber << " : upload xclbin start @" << tim_s.tv_sec << std::endl;
232236
//start the re-image process
233237
std::string delim = ":";
234238
std::string ret, key, value;
@@ -242,6 +246,10 @@ int AzureDev::azureLoadXclBin(const xclBin *buffer)
242246
value.compare("0") != 0)
243247
return -EFAULT;
244248

249+
gettimeofday(&tim_e, NULL);
250+
std::cout << fpgaSerialNumber << " : upload xclbin end @" << tim_e.tv_sec;
251+
std::cout << " ,takes " << tim_e.tv_sec - tim_s.tv_sec << " seconds" << std::endl;
252+
tim_s = tim_e;
245253
//check the re-image status
246254
int wait = 0;
247255
do {
@@ -251,18 +259,22 @@ int AzureDev::azureLoadXclBin(const xclBin *buffer)
251259
fpgaSerialNumber
252260
);
253261
if (splitLine(ret, key, value, delim) != 0 ||
254-
key.compare("GetReimagingStatus") != 0)
255-
return -EFAULT;
256-
257-
if (value.compare("3") != 0) {
262+
key.compare("GetReimagingStatus") != 0) {
263+
std::cout << "Retrying GetReimagingStatus ... " << std::endl;
264+
sleep(1);
265+
wait++;
266+
continue;
267+
} else if (value.compare("3") != 0) {
258268
sleep(1);
259269
wait++;
260270
continue;
261271
} else {
262-
std::cout << "reimaging return status: " << value << " within " << wait << "s" << std::endl;
272+
gettimeofday(&tim_e, NULL);
273+
std::cout << fpgaSerialNumber << " : reimage(return status: " << value << ") end @" << tim_e.tv_sec;
274+
std::cout << " ,takes " << tim_e.tv_sec - tim_s.tv_sec << " seconds" << std::endl;
263275
return 0;
264276
}
265-
} while (wait < REIMAGE_TIMEOUT);
277+
} while (wait < rest_timeout);
266278

267279
return -ETIMEDOUT;
268280
}
@@ -311,7 +323,7 @@ int AzureDev::azureHotReset()
311323
std::cout << "getreset status return status: " << value << " within " << wait << "s" << std::endl;
312324
return 0;
313325
}
314-
} while (wait < REIMAGE_TIMEOUT);
326+
} while (wait < rest_timeout);
315327
syslog(LOG_INFO, "complete get reset status");
316328
return 0;
317329
}
@@ -339,6 +351,8 @@ int AzureDev::UploadToWireServer(
339351
CURL *curl;
340352
CURLcode res;
341353
struct write_unit unit;
354+
int retryCounter = 0;
355+
long responseCode = 0;
342356

343357
unit.uptr = data.c_str();
344358
unit.sizeleft = data.size();
@@ -377,12 +391,42 @@ int AzureDev::UploadToWireServer(
377391
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
378392

379393
//curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L);
380-
res = curl_easy_perform(curl);
381394

382-
if (res != CURLE_OK) {
383-
std::cerr << "curl_easy_perform() failed: " << curl_easy_strerror(res) << std::endl;
384-
return 1;
385-
}
395+
// add retry for http requests
396+
do {
397+
responseCode = 0;
398+
res = curl_easy_perform(curl);
399+
400+
if (res != CURLE_OK) {
401+
std::cerr << "curl_easy_perform() failed: " << curl_easy_strerror(res) << std::endl;
402+
if (retryCounter < upload_retry) {
403+
retryCounter++;
404+
std::cout << "Retrying an upload..." << retryCounter << std::endl;
405+
sleep(1);
406+
} else {
407+
std::cerr << "Max number of retries reached... givin up" << std::endl;
408+
curl_easy_cleanup(curl);
409+
return 1;
410+
}
411+
} else {
412+
// check the return code
413+
curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &responseCode);
414+
std::cout << "Debug: status code " << responseCode << std::endl;
415+
if (responseCode >= 400) { // retry for networking issue
416+
// error range
417+
res = CURLE_HTTP_RETURNED_ERROR;
418+
if (retryCounter < upload_retry) {
419+
retryCounter++;
420+
std::cout << "Retrying an upload after http error..." << retryCounter << std::endl;
421+
sleep(1);
422+
} else {
423+
std::cerr << "Max number of retries reached... givin up" << std::endl;
424+
curl_easy_cleanup(curl);
425+
return 1;
426+
}
427+
} // if (responseCode >= 400)
428+
} // if (res != CURLE_OK)
429+
} while (res != CURLE_OK);
386430

387431
// cleanup
388432
curl_easy_cleanup(curl);
@@ -403,7 +447,7 @@ std::string AzureDev::REST_Get(
403447
std::string readbuff = "";
404448

405449
curl = curl_easy_init();
406-
if(curl)
450+
if (curl)
407451
{
408452
std::stringstream urlStream;
409453
urlStream << "http://" << ip << "/" << endpoint << "&chipid=" << target;
@@ -445,15 +489,15 @@ int AzureDev::Sha256AndSplit(
445489

446490
while (pos < input.size())
447491
{
448-
std::string segment = input.substr(pos, TRANSFER_SEGMENT_SIZE);
492+
std::string segment = input.substr(pos, transfer_segment_size);
449493

450494
if(!SHA256_Update(&context, segment.c_str(), segment.size()))
451495
{
452496
std::cerr << "Unable to Update SHA256 buffer" << std::endl;
453497
return 1;
454498
}
455499
output.push_back(segment);
456-
pos += TRANSFER_SEGMENT_SIZE;
500+
pos += transfer_segment_size;
457501
}
458502

459503
// Get Final SHA

src/runtime_src/core/pcie/tools/cloud-daemon/azure/azure.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/**
2-
* Copyright (C) 2019 Xilinx, Inc
2+
* Copyright (C) 2019-2020 Xilinx, Inc
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License"). You may
55
* not use this file except in compliance with the License. A copy of the
@@ -89,8 +89,9 @@ class AzureDev
8989
}
9090
private:
9191
// 4 MB buffer to truncate and send
92-
static const int TRANSFER_SEGMENT_SIZE { 1024 * 4096 };
93-
static const int REIMAGE_TIMEOUT { 20 }; //in second
92+
static const int transfer_segment_size { 1024 * 4096 };
93+
static const int rest_timeout { 20 }; //in seconds
94+
static const int upload_retry { 15 }; //max number of upload retries
9495
std::shared_ptr<pcidev::pci_device> dev;
9596
size_t index;
9697
int UploadToWireServer(

0 commit comments

Comments
 (0)