feat: Spider web scraper & crawler tool (#5725)

William Espegren 2024-07-18 08:29:33 +02:00 committed by GitHub
parent d5dca46854
commit 588615b20e
6 changed files with 426 additions and 0 deletions

View File

@@ -0,0 +1 @@
<svg height="30" width="30" viewBox="0 0 36 34" xml:space="preserve" xmlns="http://www.w3.org/2000/svg" class="fill-accent-foreground transition-all group-hover:scale-110"><title>Spider v1 Logo</title><path fill-rule="evenodd" clip-rule="evenodd" d="M9.13883 7.06589V0.164429L13.0938 0.164429V6.175L14.5178 7.4346C15.577 6.68656 16.7337 6.27495 17.945 6.27495C19.1731 6.27495 20.3451 6.69807 21.4163 7.46593L22.8757 6.175V0.164429L26.8307 0.164429V7.06589V7.95679L26.1634 8.54706L24.0775 10.3922C24.3436 10.8108 24.5958 11.2563 24.8327 11.7262L26.0467 11.4215L28.6971 8.08749L31.793 10.5487L28.7257 14.407L28.3089 14.9313L27.6592 15.0944L26.2418 15.4502C26.3124 15.7082 26.3793 15.9701 26.4422 16.2355L28.653 16.6566L29.092 16.7402L29.4524 17.0045L35.3849 21.355L33.0461 24.5444L27.474 20.4581L27.0719 20.3816C27.1214 21.0613 27.147 21.7543 27.147 22.4577C27.147 22.5398 27.1466 22.6214 27.1459 22.7024L29.5889 23.7911L30.3219 24.1177L30.62 24.8629L33.6873 32.5312L30.0152 34L27.246 27.0769L26.7298 26.8469C25.5612 32.2432 22.0701 33.8808 17.945 33.8808C13.8382 33.8808 10.3598 32.2577 9.17593 26.9185L8.82034 27.0769L6.05109 34L2.37897 32.5312L5.44629 24.8629L5.74435 24.1177L6.47743 23.7911L8.74487 22.7806C8.74366 22.6739 8.74305 22.5663 8.74305 22.4577C8.74305 21.7616 8.76804 21.0758 8.81654 20.4028L8.52606 20.4581L2.95395 24.5444L0.615112 21.355L6.54761 17.0045L6.908 16.7402L7.34701 16.6566L9.44264 16.2575C9.50917 15.9756 9.5801 15.6978 9.65528 15.4242L8.34123 15.0944L7.69155 14.9313L7.27471 14.407L4.20739 10.5487L7.30328 8.08749L9.95376 11.4215L11.0697 11.7016C11.3115 11.2239 11.5692 10.7716 11.8412 10.3473L9.80612 8.54706L9.13883 7.95679V7.06589Z"></path></svg>


View File

@@ -0,0 +1,14 @@
from typing import Any

from core.tools.errors import ToolProviderCredentialValidationError
from core.tools.provider.builtin.spider.spiderApp import Spider
from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController


class SpiderProvider(BuiltinToolProviderController):
    def _validate_credentials(self, credentials: dict[str, Any]) -> None:
        try:
            app = Spider(api_key=credentials["spider_api_key"])
            app.scrape_url(url="https://spider.cloud")
        except Exception as e:
            raise ToolProviderCredentialValidationError(str(e))
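
For reference, the credential check above is just a one-page scrape of spider.cloud with the supplied key; any failure surfaces as ToolProviderCredentialValidationError. A minimal standalone sketch of the same check, assuming SPIDER_API_KEY is set in the environment:

import os

from core.tools.provider.builtin.spider.spiderApp import Spider

# Mirrors the provider's validation: a single scrape of spider.cloud.
# Any exception raised here is what the provider wraps into a credential error.
app = Spider(api_key=os.environ.get("SPIDER_API_KEY"))
app.scrape_url(url="https://spider.cloud")
print("Spider API key accepted")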

View File

@@ -0,0 +1,27 @@
identity:
author: William Espegren
name: spider
label:
en_US: Spider
zh_CN: Spider
description:
en_US: Spider API integration, returning LLM-ready data by scraping & crawling websites.
zh_CN: Spider API 集成,通过爬取和抓取网站返回 LLM-ready 数据。
icon: icon.svg
tags:
- search
- utilities
credentials_for_provider:
spider_api_key:
type: secret-input
required: true
label:
en_US: Spider API Key
zh_CN: Spider API 密钥
placeholder:
en_US: Please input your Spider API key
zh_CN: 请输入您的 Spider API 密钥
help:
en_US: Get your Spider API key from your Spider dashboard
zh_CN: 从您的 Spider 仪表板中获取 Spider API 密钥。
url: https://spider.cloud/

View File

@@ -0,0 +1,237 @@
import os
from typing import Literal, Optional, TypedDict

import requests


class RequestParamsDict(TypedDict, total=False):
url: Optional[str]
request: Optional[Literal["http", "chrome", "smart"]]
limit: Optional[int]
return_format: Optional[Literal["raw", "markdown", "html2text", "text", "bytes"]]
tld: Optional[bool]
depth: Optional[int]
cache: Optional[bool]
budget: Optional[dict[str, int]]
locale: Optional[str]
cookies: Optional[str]
stealth: Optional[bool]
headers: Optional[dict[str, str]]
anti_bot: Optional[bool]
metadata: Optional[bool]
viewport: Optional[dict[str, int]]
encoding: Optional[str]
subdomains: Optional[bool]
user_agent: Optional[str]
store_data: Optional[bool]
gpt_config: Optional[list[str]]
fingerprint: Optional[bool]
storageless: Optional[bool]
readability: Optional[bool]
proxy_enabled: Optional[bool]
respect_robots: Optional[bool]
query_selector: Optional[str]
full_resources: Optional[bool]
request_timeout: Optional[int]
run_in_background: Optional[bool]
skip_config_checks: Optional[bool]
class Spider:
def __init__(self, api_key: Optional[str] = None):
"""
Initialize the Spider with an API key.
:param api_key: A string of the API key for Spider. Defaults to the SPIDER_API_KEY environment variable.
:raises ValueError: If no API key is provided.
"""
self.api_key = api_key or os.getenv("SPIDER_API_KEY")
if self.api_key is None:
raise ValueError("No API key provided")
def api_post(
self,
endpoint: str,
data: dict,
stream: bool,
content_type: str = "application/json",
):
"""
Send a POST request to the specified API endpoint.
:param endpoint: The API endpoint to which the POST request is sent.
:param data: The data (dictionary) to be sent in the POST request.
:param stream: Boolean indicating if the response should be streamed.
:return: The JSON response or the raw response stream if stream is True.
"""
headers = self._prepare_headers(content_type)
response = self._post_request(
f"https://api.spider.cloud/v1/{endpoint}", data, headers, stream
)
if stream:
return response
elif response.status_code == 200:
return response.json()
else:
self._handle_error(response, f"post to {endpoint}")
def api_get(
self, endpoint: str, stream: bool, content_type: str = "application/json"
):
"""
Send a GET request to the specified endpoint.
:param endpoint: The API endpoint from which to retrieve data.
:return: The JSON decoded response.
"""
headers = self._prepare_headers(content_type)
response = self._get_request(
f"https://api.spider.cloud/v1/{endpoint}", headers, stream
)
if response.status_code == 200:
return response.json()
else:
self._handle_error(response, f"get from {endpoint}")
def get_credits(self):
"""
Retrieve the account's remaining credits.
:return: JSON response containing the number of credits left.
"""
return self.api_get("credits", stream=False)
def scrape_url(
self,
url: str,
params: Optional[RequestParamsDict] = None,
stream: bool = False,
content_type: str = "application/json",
):
"""
Scrape data from the specified URL.
:param url: The URL from which to scrape data.
:param params: Optional dictionary of additional parameters for the scrape request.
:return: JSON response containing the scraping results.
"""
        params = params or {}

        # Add { "return_format": "markdown" } to the params if not already present
        if "return_format" not in params:
            params["return_format"] = "markdown"

        # A scrape is a single-page crawl, so force the limit to 1
        params["limit"] = 1

        return self.api_post("crawl", {"url": url, **params}, stream, content_type)
def crawl_url(
self,
url: str,
params: Optional[RequestParamsDict] = None,
stream: bool = False,
content_type: str = "application/json",
):
"""
Start crawling at the specified URL.
:param url: The URL to begin crawling.
:param params: Optional dictionary with additional parameters to customize the crawl.
:param stream: Boolean indicating if the response should be streamed. Defaults to False.
:return: JSON response or the raw response stream if streaming enabled.
"""
        params = params or {}

        # Add { "return_format": "markdown" } to the params if not already present
        if "return_format" not in params:
            params["return_format"] = "markdown"

        return self.api_post("crawl", {"url": url, **params}, stream, content_type)
def links(
self,
url: str,
params: Optional[RequestParamsDict] = None,
stream: bool = False,
content_type: str = "application/json",
):
"""
Retrieve links from the specified URL.
:param url: The URL from which to extract links.
:param params: Optional parameters for the link retrieval request.
:return: JSON response containing the links.
"""
return self.api_post(
"links", {"url": url, **(params or {})}, stream, content_type
)
def extract_contacts(
self,
url: str,
params: Optional[RequestParamsDict] = None,
stream: bool = False,
content_type: str = "application/json",
):
"""
Extract contact information from the specified URL.
:param url: The URL from which to extract contact information.
:param params: Optional parameters for the contact extraction.
:return: JSON response containing extracted contact details.
"""
return self.api_post(
"pipeline/extract-contacts",
{"url": url, **(params or {})},
stream,
content_type,
)
def label(
self,
url: str,
params: Optional[RequestParamsDict] = None,
stream: bool = False,
content_type: str = "application/json",
):
"""
Apply labeling to data extracted from the specified URL.
:param url: The URL to label data from.
:param params: Optional parameters to guide the labeling process.
:return: JSON response with labeled data.
"""
return self.api_post(
"pipeline/label", {"url": url, **(params or {})}, stream, content_type
)
def _prepare_headers(self, content_type: str = "application/json"):
return {
"Content-Type": content_type,
"Authorization": f"Bearer {self.api_key}",
"User-Agent": "Spider-Client/0.0.27",
}
def _post_request(self, url: str, data, headers, stream=False):
return requests.post(url, headers=headers, json=data, stream=stream)
def _get_request(self, url: str, headers, stream=False):
return requests.get(url, headers=headers, stream=stream)
def _delete_request(self, url: str, headers, stream=False):
return requests.delete(url, headers=headers, stream=stream)
def _handle_error(self, response, action):
if response.status_code in [402, 409, 500]:
error_message = response.json().get("error", "Unknown error occurred")
raise Exception(
f"Failed to {action}. Status code: {response.status_code}. Error: {error_message}"
)
else:
raise Exception(
f"Unexpected error occurred while trying to {action}. Status code: {response.status_code}"
)

View File

@@ -0,0 +1,47 @@
from typing import Any, Union

from core.tools.entities.tool_entities import ToolInvokeMessage
from core.tools.provider.builtin.spider.spiderApp import Spider
from core.tools.tool.builtin_tool import BuiltinTool


class ScrapeTool(BuiltinTool):
def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
# initialize the app object with the api key
app = Spider(api_key=self.runtime.credentials['spider_api_key'])
url = tool_parameters['url']
mode = tool_parameters['mode']
options = {
'limit': tool_parameters.get('limit', 0),
'depth': tool_parameters.get('depth', 0),
'blacklist': tool_parameters.get('blacklist', '').split(',') if tool_parameters.get('blacklist') else [],
'whitelist': tool_parameters.get('whitelist', '').split(',') if tool_parameters.get('whitelist') else [],
'readability': tool_parameters.get('readability', False),
}
result = ""
try:
if mode == 'scrape':
scrape_result = app.scrape_url(
url=url,
params=options,
)
for i in scrape_result:
result += "URL: " + i.get('url', '') + "\n"
result += "CONTENT: " + i.get('content', '') + "\n\n"
elif mode == 'crawl':
crawl_result = app.crawl_url(
url=tool_parameters['url'],
params=options,
)
for i in crawl_result:
result += "URL: " + i.get('url', '') + "\n"
result += "CONTENT: " + i.get('content', '') + "\n\n"
        except Exception as e:
            return self.create_text_message(f"An error occurred: {e}")
return self.create_text_message(result)
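
The tool flattens whatever Spider returns into plain text, one URL/CONTENT block per page. A small sketch of that assembly against a hypothetical response (the list-of-dicts shape is what the loops above expect, not a documented guarantee):

# Hypothetical Spider response: one dict per fetched page.
crawl_result = [
    {"url": "https://spider.cloud", "content": "Spider - an open source web crawler..."},
    {"url": "https://spider.cloud/docs", "content": "API reference..."},
]

result = ""
for page in crawl_result:
    result += "URL: " + page.get("url", "") + "\n"
    result += "CONTENT: " + page.get("content", "") + "\n\n"

# result now holds the text that _invoke wraps in a single ToolInvokeMessage.
print(result)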

View File

@@ -0,0 +1,100 @@
identity:
name: scraper_crawler
author: William Espegren
label:
en_US: Web Scraper & Crawler
zh_Hans: 网页抓取与爬虫
description:
human:
    en_US: A tool for scraping & crawling webpages. Input should be a URL.
    zh_Hans: 用于抓取和爬取网页的工具。输入应该是一个网址。
  llm: A tool for scraping & crawling webpages. Input should be a URL.
parameters:
- name: url
type: string
required: true
label:
en_US: URL
zh_Hans: 网址
human_description:
      en_US: URL to be scraped or crawled
      zh_Hans: 要抓取或爬取的网址
    llm_description: URL to either be scraped or crawled
form: llm
- name: mode
type: select
required: true
options:
- value: scrape
label:
en_US: scrape
zh_Hans: 抓取
- value: crawl
label:
en_US: crawl
zh_Hans: 爬取
default: crawl
label:
en_US: Mode
zh_Hans: 模式
human_description:
      en_US: Select whether to scrape a single page or crawl the entire website, following its subpages
zh_Hans: 用于选择抓取网站或爬取整个网站及其子页面
form: form
- name: limit
type: number
required: false
label:
      en_US: Maximum number of pages to crawl
zh_Hans: 最大爬取页面数
human_description:
      en_US: Specify the maximum number of pages to crawl per website. The crawler will stop after reaching this limit.
zh_Hans: 指定每个网站要爬取的最大页面数。爬虫将在达到此限制后停止。
form: form
min: 0
default: 0
- name: depth
type: number
required: false
label:
      en_US: Maximum depth of pages to crawl
zh_Hans: 最大爬取深度
human_description:
      en_US: The maximum link depth the crawler will follow.
zh_Hans: 最大爬取深度的限制。
form: form
min: 0
default: 0
- name: blacklist
type: string
required: false
label:
      en_US: URL patterns to exclude
zh_Hans: 要排除的URL模式
human_description:
      en_US: Blacklist a set of paths that you do not want to crawl. You can use regex patterns to help with the list.
zh_Hans: 指定一组不想爬取的路径。您可以使用正则表达式模式来帮助定义列表。
placeholder: /blog/*, /about
form: form
- name: whitelist
type: string
required: false
label:
en_US: URL patterns to include
zh_Hans: 要包含的URL模式
human_description:
en_US: Whitelist a set of paths that you want to crawl, ignoring all other routes that do not match the patterns. You can use regex patterns to help with the list.
zh_Hans: 指定一组要爬取的路径,忽略所有不匹配模式的其他路由。您可以使用正则表达式模式来帮助定义列表。
placeholder: /blog/*, /about
form: form
- name: readability
type: boolean
required: false
label:
en_US: Pre-process the content for LLM usage
zh_Hans: 仅返回页面的主要内容
human_description:
en_US: Use Mozilla's readability to pre-process the content for reading. This may drastically improve the content for LLM usage.
zh_Hans: 如果启用,爬虫将仅返回页面的主要内容,不包括标题、导航、页脚等。
form: form
default: false
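
For clarity, the blacklist and whitelist fields above are plain comma-separated strings in the form; the tool splits them into lists before handing them to the Spider client, and limit/depth default to 0 when left unset. A sketch of how a filled-in form maps onto the options dict built in _invoke (values are illustrative):

# Illustrative form values, matching the parameters declared above.
tool_parameters = {
    "url": "https://spider.cloud",
    "mode": "crawl",
    "limit": 5,
    "depth": 2,
    "blacklist": "/blog/*,/about",
    "whitelist": "",
    "readability": True,
}

# The same transformation the tool applies before calling the Spider client.
options = {
    "limit": tool_parameters.get("limit", 0),
    "depth": tool_parameters.get("depth", 0),
    "blacklist": tool_parameters.get("blacklist", "").split(",") if tool_parameters.get("blacklist") else [],
    "whitelist": tool_parameters.get("whitelist", "").split(",") if tool_parameters.get("whitelist") else [],
    "readability": tool_parameters.get("readability", False),
}
# options == {"limit": 5, "depth": 2, "blacklist": ["/blog/*", "/about"], "whitelist": [], "readability": True}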