Solve the deadlock problem caused by queuing (#13191)

* Solve the deadlock problem caused by queuing

* Solve the deadlock problem caused by queuing

* Solve the deadlock problem caused by queuing

* Solve the deadlock problem caused by queuing, move the event to the tail by throwing an exception

Co-authored-by: wfs <wangfushun@cdqcp.cpm>

(cherry picked from commit 7a0a2c2a46)
This commit is contained in:
sssqhai 2022-12-16 19:55:02 +08:00 committed by Jay Chung
parent 19771e506f
commit bc1cf25f4d
4 changed files with 57 additions and 4 deletions

View File

@ -0,0 +1,33 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.dolphinscheduler.server.master.event;
/**
 * Signals that handling a state event failed in a way that can be recovered from.
 * When this exception is caught, the event is moved to the tail of the queue.
 */
public class StateEventHandleFailure extends Exception {

    /**
     * Creates a failure that carries a description together with its underlying cause.
     *
     * @param message   description of why the state event could not be handled
     * @param throwable the underlying cause of the failure
     */
    public StateEventHandleFailure(String message, Throwable throwable) {
        super(message, throwable);
    }

    /**
     * Creates a failure that carries only a description.
     *
     * @param message description of why the state event could not be handled
     */
    public StateEventHandleFailure(String message) {
        super(message);
    }
}

View File

@ -28,9 +28,10 @@ public interface StateEventHandler {
* @param stateEvent given state event.
* @throws StateEventHandleException this exception means it can be recovered.
* @throws StateEventHandleError this exception means it cannot be recovered, so the event need to drop.
* @throws StateEventHandleFailure this means it can be recovered, the event will be moved to the tail of the queue.
*/
boolean handleStateEvent(WorkflowExecuteRunnable workflowExecuteRunnable, StateEvent stateEvent)
throws StateEventHandleException, StateEventHandleError;
boolean handleStateEvent(WorkflowExecuteRunnable workflowExecuteRunnable,
StateEvent stateEvent) throws StateEventHandleException, StateEventHandleError, StateEventHandleFailure;
StateEventType getEventType();
}

View File

@ -20,13 +20,24 @@ package org.apache.dolphinscheduler.server.master.event;
import org.apache.dolphinscheduler.common.enums.StateEventType;
import org.apache.dolphinscheduler.server.master.runner.WorkflowExecuteRunnable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.auto.service.AutoService;
@AutoService(StateEventHandler.class)
public class TaskWaitTaskGroupStateHandler implements StateEventHandler {
private static final Logger logger = LoggerFactory.getLogger(TaskWaitTaskGroupStateHandler.class);
@Override
public boolean handleStateEvent(WorkflowExecuteRunnable workflowExecuteRunnable, StateEvent stateEvent) {
return workflowExecuteRunnable.checkForceStartAndWakeUp(stateEvent);
public boolean handleStateEvent(WorkflowExecuteRunnable workflowExecuteRunnable,
                                StateEvent stateEvent) throws StateEventHandleFailure {
    logger.info("Handle task instance wait task group event, taskInstanceId: {}", stateEvent.getTaskInstanceId());
    // Delegate to the workflow runnable; its boolean result decides between success and failure.
    final boolean handled = workflowExecuteRunnable.checkForceStartAndWakeUp(stateEvent);
    if (handled) {
        return true;
    }
    // A false result is surfaced as a StateEventHandleFailure rather than as a false return value,
    // so this method only ever returns true.
    throw new StateEventHandleFailure("Task state event handle failed due to robing taskGroup resource failed");
}
@Override

View File

@ -70,6 +70,7 @@ import org.apache.dolphinscheduler.server.master.dispatch.executor.NettyExecutor
import org.apache.dolphinscheduler.server.master.event.StateEvent;
import org.apache.dolphinscheduler.server.master.event.StateEventHandleError;
import org.apache.dolphinscheduler.server.master.event.StateEventHandleException;
import org.apache.dolphinscheduler.server.master.event.StateEventHandleFailure;
import org.apache.dolphinscheduler.server.master.event.StateEventHandler;
import org.apache.dolphinscheduler.server.master.event.StateEventHandlerManager;
import org.apache.dolphinscheduler.server.master.metrics.TaskMetrics;
@ -279,6 +280,13 @@ public class WorkflowExecuteRunnable implements Callable<WorkflowSubmitStatue> {
stateEvent,
stateEventHandleException);
ThreadUtils.sleep(Constants.SLEEP_TIME_MILLIS);
} catch (StateEventHandleFailure stateEventHandleFailure) {
logger.error("State event handle failed, will move event to the tail: {}",
stateEvent,
stateEventHandleFailure);
this.stateEvents.remove(stateEvent);
this.stateEvents.offer(stateEvent);
ThreadUtils.sleep(Constants.SLEEP_TIME_MILLIS);
} catch (Exception e) {
// we catch the exception here, since if the state event handle failed, the state event will still keep in the stateEvents queue.
logger.error("State event handle error, get a unknown exception, will retry this event: {}",