From 4dfa8eedb899f60c79cbc9da24d79f165b70ca39 Mon Sep 17 00:00:00 2001
From: Matri
Date: Fri, 9 Aug 2024 11:05:33 +0800
Subject: [PATCH] Feat/tool-D-ID (#6278)

---
 .../provider/builtin/did/_assets/icon.svg     |  14 ++
 api/core/tools/provider/builtin/did/did.py    |  21 +++
 api/core/tools/provider/builtin/did/did.yaml  |  28 ++++
 .../tools/provider/builtin/did/did_appx.py    |  87 ++++++++++++
 .../provider/builtin/did/tools/animations.py  |  49 +++++++
 .../builtin/did/tools/animations.yaml         |  86 ++++++++++++
 .../tools/provider/builtin/did/tools/talks.py |  65 +++++++++
 .../provider/builtin/did/tools/talks.yaml     | 126 ++++++++++++++++++
 8 files changed, 476 insertions(+)
 create mode 100644 api/core/tools/provider/builtin/did/_assets/icon.svg
 create mode 100644 api/core/tools/provider/builtin/did/did.py
 create mode 100644 api/core/tools/provider/builtin/did/did.yaml
 create mode 100644 api/core/tools/provider/builtin/did/did_appx.py
 create mode 100644 api/core/tools/provider/builtin/did/tools/animations.py
 create mode 100644 api/core/tools/provider/builtin/did/tools/animations.yaml
 create mode 100644 api/core/tools/provider/builtin/did/tools/talks.py
 create mode 100644 api/core/tools/provider/builtin/did/tools/talks.yaml

diff --git a/api/core/tools/provider/builtin/did/_assets/icon.svg b/api/core/tools/provider/builtin/did/_assets/icon.svg
new file mode 100644
index 000000000..c477d7cb7
--- /dev/null
+++ b/api/core/tools/provider/builtin/did/_assets/icon.svg
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/api/core/tools/provider/builtin/did/did.py b/api/core/tools/provider/builtin/did/did.py
new file mode 100644
index 000000000..b4bf17213
--- /dev/null
+++ b/api/core/tools/provider/builtin/did/did.py
@@ -0,0 +1,21 @@
+from core.tools.errors import ToolProviderCredentialValidationError
+from core.tools.provider.builtin.did.tools.talks import TalksTool
+from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController
+
+
+class DIDProvider(BuiltinToolProviderController):
+    def _validate_credentials(self, credentials: dict) -> None:
+        try:
+            # Example validation using the D-ID talks tool
+            TalksTool().fork_tool_runtime(
+                runtime={"credentials": credentials}
+            ).invoke(
+                user_id='',
+                tool_parameters={
+                    "source_url": "https://www.d-id.com/wp-content/uploads/2023/11/Hero-image-1.png",
+                    "text_input": "Hello, welcome to use D-ID tool in Dify",
+                }
+            )
+        except Exception as e:
+            raise ToolProviderCredentialValidationError(str(e))
+
\ No newline at end of file
diff --git a/api/core/tools/provider/builtin/did/did.yaml b/api/core/tools/provider/builtin/did/did.yaml
new file mode 100644
index 000000000..a70b71812
--- /dev/null
+++ b/api/core/tools/provider/builtin/did/did.yaml
@@ -0,0 +1,28 @@
+identity:
+  author: Matri Qi
+  name: did
+  label:
+    en_US: D-ID
+  description:
+    en_US: D-ID is a tool enabling the creation of high-quality, custom videos of Digital Humans from a single image.
+  icon: icon.svg
+  tags:
+    - videos
+credentials_for_provider:
+  did_api_key:
+    type: secret-input
+    required: true
+    label:
+      en_US: D-ID API Key
+    placeholder:
+      en_US: Please input your D-ID API key
+    help:
+      en_US: Get your D-ID API key from your D-ID account settings.
+    url: https://studio.d-id.com/account-settings
+  base_url:
+    type: text-input
+    required: false
+    label:
+      en_US: D-ID server's Base URL
+    placeholder:
+      en_US: https://api.d-id.com
diff --git a/api/core/tools/provider/builtin/did/did_appx.py b/api/core/tools/provider/builtin/did/did_appx.py
new file mode 100644
index 000000000..964e82b72
--- /dev/null
+++ b/api/core/tools/provider/builtin/did/did_appx.py
@@ -0,0 +1,87 @@
+import logging
+import time
+from collections.abc import Mapping
+from typing import Any
+
+import requests
+from requests.exceptions import HTTPError
+
+logger = logging.getLogger(__name__)
+
+
+class DIDApp:
+    def __init__(self, api_key: str | None = None, base_url: str | None = None):
+        self.api_key = api_key
+        self.base_url = base_url or 'https://api.d-id.com'
+        if not self.api_key:
+            raise ValueError('API key is required')
+
+    def _prepare_headers(self, idempotency_key: str | None = None):
+        headers = {'Content-Type': 'application/json', 'Authorization': f'Basic {self.api_key}'}
+        if idempotency_key:
+            headers['Idempotency-Key'] = idempotency_key
+        return headers
+
+    def _request(
+        self,
+        method: str,
+        url: str,
+        data: Mapping[str, Any] | None = None,
+        headers: Mapping[str, str] | None = None,
+        retries: int = 3,
+        backoff_factor: float = 0.3,
+    ) -> Mapping[str, Any] | None:
+        for i in range(retries):
+            try:
+                response = requests.request(method, url, json=data, headers=headers)
+                response.raise_for_status()
+                return response.json()
+            except requests.exceptions.RequestException as e:
+                if i < retries - 1 and isinstance(e, HTTPError) and e.response.status_code >= 500:
+                    time.sleep(backoff_factor * (2**i))
+                else:
+                    raise
+        return None
+
+    def talks(self, wait: bool = True, poll_interval: int = 5, idempotency_key: str | None = None, **kwargs):
+        endpoint = f'{self.base_url}/talks'
+        headers = self._prepare_headers(idempotency_key)
+        data = kwargs['params']
+        logger.debug(f'Send request to {endpoint=} body={data}')
+        response = self._request('POST', endpoint, data, headers)
+        if response is None:
+            raise HTTPError('Failed to initiate D-ID talks after multiple retries')
+        id: str = response['id']
+        if wait:
+            return self._monitor_job_status(id=id, target='talks', poll_interval=poll_interval)
+        return id
+
+    def animations(self, wait: bool = True, poll_interval: int = 5, idempotency_key: str | None = None, **kwargs):
+        endpoint = f'{self.base_url}/animations'
+        headers = self._prepare_headers(idempotency_key)
+        data = kwargs['params']
+        logger.debug(f'Send request to {endpoint=} body={data}')
+        response = self._request('POST', endpoint, data, headers)
+        if response is None:
+            raise HTTPError('Failed to initiate D-ID animations after multiple retries')
+        id: str = response['id']
+        if wait:
+            return self._monitor_job_status(target='animations', id=id, poll_interval=poll_interval)
+        return id
+
+    def check_did_status(self, target: str, id: str):
+        endpoint = f'{self.base_url}/{target}/{id}'
+        headers = self._prepare_headers()
+        response = self._request('GET', endpoint, headers=headers)
+        if response is None:
+            raise HTTPError(f'Failed to check status for {target} {id} after multiple retries')
+        return response
+
+    def _monitor_job_status(self, target: str, id: str, poll_interval: int):
+        while True:
+            status = self.check_did_status(target=target, id=id)
+            if status['status'] == 'done':
+                return status
+            elif status['status'] == 'error' or status['status'] == 'rejected':
+                raise HTTPError(f'{target} {id} failed: {status["status"]} {status.get("error",{}).get("description")}')
+            time.sleep(poll_interval)
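
A minimal usage sketch of the DIDApp helper above, not part of the patch: it assumes a valid key in a DID_API_KEY environment variable, reuses the sample image URL from did.py, and the payload shape mirrors what talks.py builds. Reading result_url from the finished job is an assumption about the D-ID response, not something the patch itself guarantees.

# Sketch only: exercise DIDApp directly, outside of Dify's tool runtime.
import os

from core.tools.provider.builtin.did.did_appx import DIDApp

app = DIDApp(api_key=os.environ['DID_API_KEY'])  # base_url falls back to https://api.d-id.com

# POST {base_url}/talks, then poll GET {base_url}/talks/{id} every 5 seconds until status is 'done'
result = app.talks(
    wait=True,
    poll_interval=5,
    params={
        'source_url': 'https://www.d-id.com/wp-content/uploads/2023/11/Hero-image-1.png',
        'script': {'type': 'text', 'input': 'Hello from Dify'},
    },
)
print(result.get('result_url'))  # assumption: the completed talk includes a result_url field
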
diff --git a/api/core/tools/provider/builtin/did/tools/animations.py b/api/core/tools/provider/builtin/did/tools/animations.py
new file mode 100644
index 000000000..e1d9de603
--- /dev/null
+++ b/api/core/tools/provider/builtin/did/tools/animations.py
@@ -0,0 +1,49 @@
+import json
+from typing import Any, Union
+
+from core.tools.entities.tool_entities import ToolInvokeMessage
+from core.tools.provider.builtin.did.did_appx import DIDApp
+from core.tools.tool.builtin_tool import BuiltinTool
+
+
+class AnimationsTool(BuiltinTool):
+    def _invoke(
+        self, user_id: str, tool_parameters: dict[str, Any]
+    ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
+        app = DIDApp(api_key=self.runtime.credentials['did_api_key'], base_url=self.runtime.credentials['base_url'])
+
+        driver_expressions_str = tool_parameters.get('driver_expressions')
+        driver_expressions = json.loads(driver_expressions_str) if driver_expressions_str else None
+
+        config = {
+            'stitch': tool_parameters.get('stitch', True),
+            'mute': tool_parameters.get('mute'),
+            'result_format': tool_parameters.get('result_format') or 'mp4',
+        }
+        config = {k: v for k, v in config.items() if v is not None and v != ''}
+
+        options = {
+            'source_url': tool_parameters['source_url'],
+            'driver_url': tool_parameters.get('driver_url'),
+            'config': config,
+        }
+        options = {k: v for k, v in options.items() if v is not None and v != ''}
+
+        if not options.get('source_url'):
+            raise ValueError('Source URL is required')
+
+        if config.get('logo_url'):
+            if not config.get('logo_x'):
+                raise ValueError('Logo X position is required when logo URL is provided')
+            if not config.get('logo_y'):
+                raise ValueError('Logo Y position is required when logo URL is provided')
+
+        animations_result = app.animations(params=options, wait=True)
+
+        if not isinstance(animations_result, str):
+            animations_result = json.dumps(animations_result, ensure_ascii=False, indent=4)
+
+        if not animations_result:
+            return self.create_text_message('D-ID animations request failed.')
+
+        return self.create_text_message(animations_result)
diff --git a/api/core/tools/provider/builtin/did/tools/animations.yaml b/api/core/tools/provider/builtin/did/tools/animations.yaml
new file mode 100644
index 000000000..2a2036c7b
--- /dev/null
+++ b/api/core/tools/provider/builtin/did/tools/animations.yaml
@@ -0,0 +1,86 @@
+identity:
+  name: animations
+  author: Matri Qi
+  label:
+    en_US: Animations
+description:
+  human:
+    en_US: Animations creates videos that match the head movements, expressions, emotions, and voice of a driver video, applied to a source image.
+  llm: Animations creates videos that match the head movements, expressions, emotions, and voice of a driver video, applied to a source image.
+parameters:
+  - name: source_url
+    type: string
+    required: true
+    label:
+      en_US: source url
+    human_description:
+      en_US: The URL of the source image to be animated by the driver video, or a selection from the list of provided studio actors.
+    llm_description: The URL of the source image to be animated by the driver video, or a selection from the list of provided studio actors.
+    form: llm
+  - name: driver_url
+    type: string
+    required: false
+    label:
+      en_US: driver url
+    human_description:
+      en_US: The URL of the driver video to drive the animation, or a provided driver name from D-ID.
+    form: form
+  - name: mute
+    type: boolean
+    required: false
+    label:
+      en_US: mute
+    human_description:
+      en_US: Mutes the driver sound in the animated video result. Defaults to true.
+    form: form
+  - name: stitch
+    type: boolean
+    required: false
+    label:
+      en_US: stitch
+    human_description:
+      en_US: If enabled, the driver video will be stitched with the animated head video.
+    form: form
+  - name: logo_url
+    type: string
+    required: false
+    label:
+      en_US: logo url
+    human_description:
+      en_US: The URL of the logo image to be added to the animation video.
+    form: form
+  - name: logo_x
+    type: number
+    required: false
+    label:
+      en_US: logo position x
+    human_description:
+      en_US: The x position of the logo image in the animation video. It's required when logo url is provided.
+    form: form
+  - name: logo_y
+    type: number
+    required: false
+    label:
+      en_US: logo position y
+    human_description:
+      en_US: The y position of the logo image in the animation video. It's required when logo url is provided.
+    form: form
+  - name: result_format
+    type: string
+    default: mp4
+    required: false
+    label:
+      en_US: result format
+    human_description:
+      en_US: The format of the result video.
+    form: form
+    options:
+      - value: mp4
+        label:
+          en_US: mp4
+      - value: gif
+        label:
+          en_US: gif
+      - value: mov
+        label:
+          en_US: mov
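
To make the parameter handling concrete, an illustrative sketch, not part of the patch, of the request body AnimationsTool assembles for a minimal parameter set; the values are placeholders and the shape simply mirrors the options/config dicts built in _invoke above.

# Sketch only: the payload that animations.py builds and DIDApp.animations()
# posts to {base_url}/animations for a minimal set of tool parameters.
import json

tool_parameters = {
    'source_url': 'https://www.d-id.com/wp-content/uploads/2023/11/Hero-image-1.png',  # required
    'mute': True,
    'result_format': 'mp4',
}

options = {
    'source_url': tool_parameters['source_url'],
    'config': {
        'stitch': True,  # _invoke defaults stitch to True when the parameter is omitted
        'mute': tool_parameters['mute'],
        'result_format': tool_parameters['result_format'],
    },
}
print(json.dumps(options, indent=2))
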
diff --git a/api/core/tools/provider/builtin/did/tools/talks.py b/api/core/tools/provider/builtin/did/tools/talks.py
new file mode 100644
index 000000000..06b2c4cb2
--- /dev/null
+++ b/api/core/tools/provider/builtin/did/tools/talks.py
@@ -0,0 +1,65 @@
+import json
+from typing import Any, Union
+
+from core.tools.entities.tool_entities import ToolInvokeMessage
+from core.tools.provider.builtin.did.did_appx import DIDApp
+from core.tools.tool.builtin_tool import BuiltinTool
+
+
+class TalksTool(BuiltinTool):
+    def _invoke(
+        self, user_id: str, tool_parameters: dict[str, Any]
+    ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
+        app = DIDApp(api_key=self.runtime.credentials['did_api_key'], base_url=self.runtime.credentials['base_url'])
+
+        driver_expressions_str = tool_parameters.get('driver_expressions')
+        driver_expressions = json.loads(driver_expressions_str) if driver_expressions_str else None
+
+        script = {
+            'type': tool_parameters.get('script_type') or 'text',
+            'input': tool_parameters.get('text_input'),
+            'audio_url': tool_parameters.get('audio_url'),
+            'reduce_noise': tool_parameters.get('audio_reduce_noise', False),
+        }
+        script = {k: v for k, v in script.items() if v is not None and v != ''}
+        config = {
+            'stitch': tool_parameters.get('stitch', True),
+            'sharpen': tool_parameters.get('sharpen'),
+            'fluent': tool_parameters.get('fluent'),
+            'result_format': tool_parameters.get('result_format') or 'mp4',
+            'pad_audio': tool_parameters.get('pad_audio'),
+            'driver_expressions': driver_expressions,
+        }
+        config = {k: v for k, v in config.items() if v is not None and v != ''}
+
+        options = {
+            'source_url': tool_parameters['source_url'],
+            'driver_url': tool_parameters.get('driver_url'),
+            'script': script,
+            'config': config,
+        }
+        options = {k: v for k, v in options.items() if v is not None and v != ''}
+
+        if not options.get('source_url'):
+            raise ValueError('Source URL is required')
+
+        if script.get('type') == 'audio':
+            script.pop('input', None)
+            if not script.get('audio_url'):
+                raise ValueError('Audio URL is required for audio script type')
+
+        if script.get('type') == 'text':
+            script.pop('audio_url', None)
+            script.pop('reduce_noise', None)
+            if not script.get('input'):
+                raise ValueError('Text input is required for text script type')
+
+        talks_result = app.talks(params=options, wait=True)
+
+        if not isinstance(talks_result, str):
+            talks_result = json.dumps(talks_result, ensure_ascii=False, indent=4)
+
+        if not talks_result:
+            return self.create_text_message('D-ID talks request failed.')
+
+        return self.create_text_message(talks_result)
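
A usage sketch of the talks tool, not part of the patch, following the same fork_tool_runtime pattern did.py uses for credential validation; the credential values are placeholders, and base_url is included because _invoke reads it directly from the runtime credentials.

# Sketch only: invoking TalksTool programmatically with placeholder credentials.
from core.tools.provider.builtin.did.tools.talks import TalksTool

tool = TalksTool().fork_tool_runtime(
    runtime={'credentials': {'did_api_key': 'YOUR_D_ID_API_KEY', 'base_url': 'https://api.d-id.com'}}
)
message = tool.invoke(
    user_id='',
    tool_parameters={
        'source_url': 'https://www.d-id.com/wp-content/uploads/2023/11/Hero-image-1.png',
        'script_type': 'text',
        'text_input': 'Hello, welcome to use D-ID tool in Dify',
        'result_format': 'mp4',
    },
)
# The returned text message contains the finished talk status as pretty-printed JSON (see talks.py above).
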
diff --git a/api/core/tools/provider/builtin/did/tools/talks.yaml b/api/core/tools/provider/builtin/did/tools/talks.yaml
new file mode 100644
index 000000000..88d430512
--- /dev/null
+++ b/api/core/tools/provider/builtin/did/tools/talks.yaml
@@ -0,0 +1,126 @@
+identity:
+  name: talks
+  author: Matri Qi
+  label:
+    en_US: Talks
+description:
+  human:
+    en_US: Talks enables the creation of realistic talking head videos from text or audio inputs.
+  llm: Talks enables the creation of realistic talking head videos from text or audio inputs.
+parameters:
+  - name: source_url
+    type: string
+    required: true
+    label:
+      en_US: source url
+    human_description:
+      en_US: The URL of the source image to be animated by the driver video, or a selection from the list of provided studio actors.
+    llm_description: The URL of the source image to be animated by the driver video, or a selection from the list of provided studio actors.
+    form: llm
+  - name: driver_url
+    type: string
+    required: false
+    label:
+      en_US: driver url
+    human_description:
+      en_US: The URL of the driver video to drive the talk, or a provided driver name from D-ID.
+    form: form
+  - name: script_type
+    type: string
+    required: false
+    label:
+      en_US: script type
+    human_description:
+      en_US: The type of the script.
+    form: form
+    options:
+      - value: text
+        label:
+          en_US: text
+      - value: audio
+        label:
+          en_US: audio
+  - name: text_input
+    type: string
+    required: false
+    label:
+      en_US: text input
+    human_description:
+      en_US: The text input to be spoken by the talking head. Required when script type is text.
+    form: form
+  - name: audio_url
+    type: string
+    required: false
+    label:
+      en_US: audio url
+    human_description:
+      en_US: The URL of the audio file to be spoken by the talking head. Required when script type is audio.
+    form: form
+  - name: audio_reduce_noise
+    type: boolean
+    required: false
+    label:
+      en_US: audio reduce noise
+    human_description:
+      en_US: If enabled, the audio will be processed to reduce noise before being spoken by the talking head. It only works when script type is audio.
+    form: form
+  - name: stitch
+    type: boolean
+    required: false
+    label:
+      en_US: stitch
+    human_description:
+      en_US: If enabled, the driver video will be stitched with the talking head video.
+    form: form
+  - name: sharpen
+    type: boolean
+    required: false
+    label:
+      en_US: sharpen
+    human_description:
+      en_US: If enabled, the talking head video will be sharpened.
+    form: form
+  - name: result_format
+    type: string
+    required: false
+    label:
+      en_US: result format
+    human_description:
+      en_US: The format of the result video.
+    form: form
+    options:
+      - value: mp4
+        label:
+          en_US: mp4
+      - value: gif
+        label:
+          en_US: gif
+      - value: mov
+        label:
+          en_US: mov
+  - name: fluent
+    type: boolean
+    required: false
+    label:
+      en_US: fluent
+    human_description:
+      en_US: Interpolate between the last and first frames of the driver video. When used together with pad_audio, this can create a seamless transition between videos of the same driver.
+    form: form
+  - name: pad_audio
+    type: number
+    required: false
+    label:
+      en_US: pad audio
+    human_description:
+      en_US: Pad the audio with silence at the end (given in seconds). This will increase the video duration and the credits it consumes.
+    form: form
+    min: 1
+    max: 60
+  - name: driver_expressions
+    type: string
+    required: false
+    label:
+      en_US: driver expressions
+    human_description:
+      en_US: Timed expressions for the animation. It should be a JSON array-style string. See the D-ID documentation (https://docs.d-id.com/reference/createtalk) for more information.
+    form: form
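
The script_type, text_input, audio_url, and audio_reduce_noise parameters above feed the script block that talks.py builds, and fields that do not apply to the chosen script type are dropped. A small self-contained sketch of that mapping, not part of the patch, with placeholder URLs and validation omitted:

# Sketch only: mirrors the script-building logic in talks.py for both script types (validation omitted).
def build_script(tool_parameters: dict) -> dict:
    script = {
        'type': tool_parameters.get('script_type') or 'text',
        'input': tool_parameters.get('text_input'),
        'audio_url': tool_parameters.get('audio_url'),
        'reduce_noise': tool_parameters.get('audio_reduce_noise', False),
    }
    script = {k: v for k, v in script.items() if v is not None and v != ''}
    if script['type'] == 'audio':
        script.pop('input', None)
    if script['type'] == 'text':
        script.pop('audio_url', None)
        script.pop('reduce_noise', None)
    return script


print(build_script({'script_type': 'text', 'text_input': 'Hello from Dify'}))
# {'type': 'text', 'input': 'Hello from Dify'}
print(build_script({'script_type': 'audio', 'audio_url': 'https://example.com/voice.mp3'}))
# {'type': 'audio', 'audio_url': 'https://example.com/voice.mp3', 'reduce_noise': False}
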