Skip to content

Commit 46b6cbe

Browse files
committed
keep track of number of forsaken attempts
1 parent aa00d66 commit 46b6cbe

File tree

6 files changed

+48
-18
lines changed

6 files changed

+48
-18
lines changed

taskvine/src/bindings/python3/ndcctools/taskvine/task.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -423,10 +423,18 @@ def add_environment(self, f):
423423
# Indicate the number of times the task should be retried. If 0 (the
424424
# default), the task is tried indefinitely. A task that did not succeed
425425
# after the given number of retries is returned with result
426-
# "result_max_retries".
426+
# "max retries".
427427
def set_retries(self, max_retries):
428428
return cvine.vine_task_set_retries(self._task, max_retries)
429429

430+
##
431+
# Indicate the number of times the task can be returned to the manager
432+
# without being executed. If less than 0 (the default), the task is tried indefinitely.
433+
# A task that did not succeed after the given number of retries is returned
434+
# with result "forsaken".
435+
def set_max_forsaken(self, max_forsaken):
436+
return cvine.vine_task_set_max_forsaken(self._task, max_forsaken)
437+
430438
##
431439
# Indicate the number of cores required by this task.
432440
def set_cores(self, cores):

taskvine/src/manager/taskvine.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -285,13 +285,20 @@ int vine_task_add_input( struct vine_task *t, struct vine_file *f, const char *r
285285

286286
int vine_task_add_output( struct vine_task *t, struct vine_file *f, const char *remote_name, vine_mount_flags_t flags );
287287

288-
/** Specify the number of times this task is retried on worker errors. If less than one, the task is retried indefinitely (this the default). A task that did not succeed after the given number of retries is returned with result VINE_RESULT_MAX_RETRIES.
288+
/** Specify the number of times this task is retried on worker errors. If less than one, the task is retried indefinitely (this is the default). A task that did not succeed after the given number of retries is returned with the result of its last attempt.
289289
@param t A task object.
290290
@param max_retries The number of retries.
291291
*/
292292

293293
void vine_task_set_retries( struct vine_task *t, int64_t max_retries );
294294

295+
/** Specify the total number of times this task can be returned to the manager without being executed. If less than zero, the task is tried indefinitely (this is the default). A task that did not succeed after the given number of times is returned with the result VINE_RESULT_FORSAKEN.
296+
@param t A task object.
297+
@param max_forsaken The maximum number of times the task may be forsaken.
298+
*/
299+
300+
void vine_task_set_max_forsaken( struct vine_task *t, int64_t max_forsaken );
301+
295302
/** Specify the amount of disk space required by a task.
296303
@param t A task object.
297304
@param memory The amount of disk space required by the task, in megabytes.

taskvine/src/manager/vine_manager.c

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1538,6 +1538,7 @@ static vine_result_code_t get_result(struct vine_manager *q, struct vine_worker_
15381538

15391539
/* If the task was forsaken by the worker or couldn't execute, it didn't really complete, so short circuit. */
15401540
if (task_status == VINE_RESULT_FORSAKEN) {
1541+
t->forsaken_count++;
15411542
itable_remove(q->running_table, t->task_id);
15421543
vine_task_set_result(t, task_status);
15431544
change_task_state(q, t, VINE_TASK_WAITING_RETRIEVAL);
@@ -2795,10 +2796,12 @@ static int resubmit_task_on_exhaustion(struct vine_manager *q, struct vine_worke
27952796
static int resubmit_if_needed(struct vine_manager *q, struct vine_worker_info *w, struct vine_task *t)
27962797
{
27972798
/* in this function, any change_task_state should only be to VINE_TASK_READY */
2798-
27992799
if (t->result == VINE_RESULT_FORSAKEN) {
2800-
/* forsaken tasks are always resubmitted. they also get a retry back as they are victims of circumstance
2801-
*/
2800+
if (t->max_forsaken > -1 && t->forsaken_count > t->max_forsaken) {
2801+
return 0;
2802+
}
2803+
2804+
/* forsaken tasks get a retry back as they are victims of circumstance */
28022805
t->try_count -= 1;
28032806
change_task_state(q, t, VINE_TASK_READY);
28042807
return 1;
@@ -2810,8 +2813,7 @@ static int resubmit_if_needed(struct vine_manager *q, struct vine_worker_info *w
28102813
}
28112814

28122815
/* special handlings per result. note that most results are terminal, that is tasks are not retried even if they
2813-
* have not reached max_retries. max_retries is only used for transient errors, or for modified tasks (such as a
2814-
* change in the resource request). */
2816+
* have not reached max_retries. */
28152817
switch (t->result) {
28162818
case VINE_RESULT_RESOURCE_EXHAUSTION:
28172819
return resubmit_task_on_exhaustion(q, w, t);
@@ -3125,7 +3127,6 @@ static int send_one_task(struct vine_manager *q)
31253127

31263128
timestamp_t now_usecs = timestamp_get();
31273129
double now_secs = ((double)now_usecs) / ONE_SECOND;
3128-
timestamp_t time_failure_range = now_usecs - q->transient_error_interval;
31293130

31303131
int tasks_to_consider = MIN(list_size(q->ready_list), q->attempt_schedule_depth);
31313132

@@ -3140,7 +3141,7 @@ static int send_one_task(struct vine_manager *q)
31403141
}
31413142

31423143
// Skip if this task failed recently
3143-
if (time_failure_range > t->time_when_last_failure) {
3144+
if (t->time_when_last_failure + q->transient_error_interval > now_usecs) {
31443145
continue;
31453146
}
31463147

taskvine/src/manager/vine_task.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@ void vine_task_reset(struct vine_task *t)
129129

130130
t->resource_request = CATEGORY_ALLOCATION_FIRST;
131131
t->try_count = 0;
132+
t->forsaken_count = 0;
132133
t->exhausted_attempts = 0;
133134
t->workers_slow = 0;
134135

@@ -226,6 +227,7 @@ struct vine_task *vine_task_copy(const struct vine_task *task)
226227
vine_task_set_scheduler(new, task->worker_selection_algorithm);
227228
vine_task_set_priority(new, task->priority);
228229
vine_task_set_retries(new, task->max_retries);
230+
vine_task_set_max_forsaken(new, task->max_forsaken);
229231
vine_task_set_time_min(new, task->min_running_time);
230232

231233
/* Internal state of task is cleared from vine_task_create */
@@ -306,6 +308,15 @@ void vine_task_set_retries(struct vine_task *t, int64_t max_retries)
306308
}
307309
}
308310

311+
void vine_task_set_max_forsaken(struct vine_task *t, int64_t max_forsaken)
312+
{
313+
if (max_forsaken < 0) {
314+
t->max_retries = -1;
315+
} else {
316+
t->max_forsaken = max_forsaken;
317+
}
318+
}
319+
309320
void vine_task_set_memory(struct vine_task *t, int64_t memory)
310321
{
311322
if (memory < 0) {

taskvine/src/manager/vine_task.h

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,17 +61,20 @@ struct vine_task {
6161
vine_schedule_t worker_selection_algorithm; /**< How to choose worker to run the task. */
6262
double priority; /**< The priority of this task relative to others in the queue: higher number run earlier. */
6363
int max_retries; /**< Number of times the task is tried to be executed on some workers until success. If less than one, the task is retried indefinitely. See try_count below.*/
64+
int max_forsaken; /**< Number of times the task is submitted to workers without being executed. If less than zero, the task is retried indefinitely. See forsaken_count below.*/
6465
int64_t min_running_time; /**< Minimum time (in seconds) the task needs to run. (see vine_worker --wall-time)*/
6566

6667
/***** Internal state of task as it works towards completion. *****/
6768

6869
vine_task_state_t state; /**< Current state of task: READY, RUNNING, etc */
6970
struct vine_worker_info *worker; /**< Worker to which this task has been dispatched. */
7071
struct vine_task* library_task; /**< Library task to which a function task has been matched. */
71-
int try_count; /**< The number of times the task has been dispatched to a worker. If larger than max_retries, the task failes with @ref VINE_RESULT_MAX_RETRIES. */
72-
int exhausted_attempts; /**< Number of times the task failed given exhausted resources. */
73-
int workers_slow; /**< Number of times this task has been terminated for running too long. */
74-
int function_slots_inuse; /**< If a library, the number of functions currently running. */
72+
int try_count; /**< The number of times the task has been dispatched to a worker without being forsaken. If larger than max_retries, return with result of last attempt. */
73+
int forsaken_count; /**< The number of times the task has been returned to the manager as forsaken, i.e. without being executed. If larger than max_forsaken, return with VINE_RESULT_FORSAKEN. */
74+
int exhausted_attempts; /**< Number of times the task failed given exhausted resources. */
75+
int forsaken_attempts; /**< Number of times the task was submitted to a worker but failed to start execution. */
76+
int workers_slow; /**< Number of times this task has been terminated for running too long. */
77+
int function_slots_inuse; /**< If a library, the number of functions currently running. */
7578

7679
/***** Results of task once it has reached completion. *****/
7780

taskvine/test/vine_python.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,7 @@ def next_output_name():
203203
report_task(t, "success", 0)
204204

205205
# should fail in the allotted time
206-
t = vine.Task("/bin/sleep 10")
206+
t = vine.Task("/bin/sleep 100")
207207
t.set_time_max(1)
208208
q.submit(t)
209209
t = q.wait(wait_time)
@@ -266,13 +266,13 @@ def next_output_name():
266266
report_task(t, "success", 0)
267267

268268
# generate an invalid remote input file, should get a forsaken result.
269-
t = vine.Task("wc -l infile")
270-
t.set_retries(1)
269+
t = vine.Task("wc -l infile_for_forsaken")
270+
t.set_max_forsaken(1)
271271
url = q.declare_url("https://pretty-sure-this-is-not-a-valid-url.com")
272-
t.add_input(url, "infile")
272+
t.add_input(url, "infile_for_forsaken")
273273
q.submit(t)
274274
t = q.wait(wait_time)
275-
report_task(t, "transfer missing", 1)
275+
report_task(t, "forsaken", -1)
276276

277277
# create a temporary output file, and then fetch its contents manually.
278278
t = vine.Task("echo howdy > output")

0 commit comments

Comments
 (0)